فهرست منبع

修复部署后突然挂掉的bug

lxylxy123321 2 روز پیش
والد
کامیت
2525dcfcff
3 فایل‌های تغییر یافته به همراه 140 افزوده شده و 143 حذف شده
  1. 11 7
      backend/app/core/remote_executor.py
  2. 3 2
      backend/app/services/deploy_service.py
  3. 126 134
      result.txt

+ 11 - 7
backend/app/core/remote_executor.py

@@ -205,16 +205,20 @@ def run_training_remote(
 
 
 def is_process_running(pid: str) -> bool:
-    """检查远程训练进程是否还在运行。
+    """检查远程训练/推理进程是否还在运行。
 
-    通过 docker exec 进入容器检查 PID 是否存在且不是僵尸进程。
+    通过 docker exec 进入容器,优先用 kill -0 检查指定 PID,
+    兜底用 ps 检查是否存在匹配的 Python 进程。
     """
     cmd = (
-        f"docker exec {settings.compute_node_docker_container} bash -c "
-        f"'state=$(cat /proc/{pid}/stat 2>/dev/null | awk \"{{{{print \\$3}}}}\"); "
-        f"if [ \"$state\" = \"Z\" ]; then echo zombie; "
-        f"elif kill -0 {pid} 2>/dev/null; then echo running; "
-        f"else echo stopped; fi'"
+        f"docker exec {settings.compute_node_docker_container} bash -c '"
+        f"if kill -0 {pid} 2>/dev/null; then "
+        f"  state=$(cat /proc/{pid}/stat 2>/dev/null | awk \"{{{{print \\$3}}}}\"); "
+        f"  if [ \"$state\" = \"Z\" ]; then echo stopped; else echo running; "
+        f"  fi; "
+        f"else "
+        f"  echo stopped; "
+        f"fi'"
     )
     code, stdout, stderr = ssh_exec(cmd, timeout=30)
     if code != 0:

+ 3 - 2
backend/app/services/deploy_service.py

@@ -235,6 +235,7 @@ async def _launch_remote_worker(task_id: str, model_path: str, port: int) -> str
         raise RuntimeError(f"复制 inference_worker.py 失败: {stderr}")
 
     # 在容器内后台启动 worker
+    # 使用 exec 让 Python 进程直接占用 PID,避免 setsid session leader PID 不匹配
     launch_cmd = (
         f"docker exec "
         f"-e MACA_MPS_MODE=1 "
@@ -242,11 +243,11 @@ async def _launch_remote_worker(task_id: str, model_path: str, port: int) -> str
         f"-w {model_path} "
         f"{settings.compute_node_docker_container} "
         f"bash -c '"
-        f"setsid {settings.compute_node_python} inference_worker.py "
+        f"nohup {settings.compute_node_python} inference_worker.py "
         f"--model-path {model_path} "
         f"--port {port} "
         f"</dev/null >/tmp/serve_{task_id}.log 2>&1 &"
-        f" disown; echo $!'"
+        f" echo $!'"
     )
 
     code, stdout, stderr = ssh_exec(launch_cmd, timeout=30)

+ 126 - 134
result.txt

@@ -1,7 +1,8 @@
-
-ERROR:    Application startup failed. Exiting.
+lq@lq:~/Fine-tuning$ sudo docker logs -f finetune-backend
 => Syncing backend code to compute node 192.168.91.253 ...
+Warning: Permanently added '192.168.91.253' (ED25519) to the list of known hosts.
 sending incremental file list
+./
 .dockerignore
 .env.docker
 .env.example
@@ -11,11 +12,12 @@ entrypoint.sh
 main.py
 pyproject.toml
 requirements.txt
-__pycache__/main.cpython-310.pyc
+app/
 app/__init__.py
 app/config.py
 app/__pycache__/__init__.cpython-310.pyc
 app/__pycache__/config.cpython-310.pyc
+app/api/
 app/api/__init__.py
 app/api/api_keys.py
 app/api/auth.py
@@ -36,6 +38,7 @@ app/api/__pycache__/inference.cpython-310.pyc
 app/api/__pycache__/models.cpython-310.pyc
 app/api/__pycache__/sample_center.cpython-310.pyc
 app/api/__pycache__/training.cpython-310.pyc
+app/core/
 app/core/__init__.py
 app/core/auth.py
 app/core/background_tasks.py
@@ -50,6 +53,7 @@ app/core/remote_executor.py
 app/core/security.py
 app/core/sso_client.py
 app/core/websocket.py
+app/core/__pycache__/
 app/core/__pycache__/__init__.cpython-310.pyc
 app/core/__pycache__/auth.cpython-310.pyc
 app/core/__pycache__/background_tasks.cpython-310.pyc
@@ -62,6 +66,7 @@ app/core/__pycache__/remote_executor.cpython-310.pyc
 app/core/__pycache__/security.cpython-310.pyc
 app/core/__pycache__/sso_client.cpython-310.pyc
 app/core/__pycache__/websocket.cpython-310.pyc
+app/engines/
 app/engines/__init__.py
 app/engines/__main__.py
 app/engines/base.py
@@ -73,10 +78,13 @@ app/engines/__pycache__/__init__.cpython-310.pyc
 app/engines/__pycache__/base.cpython-310.pyc
 app/engines/__pycache__/remote_train.cpython-310.pyc
 app/engines/__pycache__/text_engine.cpython-310.pyc
+app/peft/
 app/peft/__init__.py
 app/peft/__pycache__/__init__.cpython-310.pyc
+app/preprocessors/
 app/preprocessors/__init__.py
 app/preprocessors/__pycache__/__init__.cpython-310.pyc
+app/schemas/
 app/schemas/__init__.py
 app/schemas/background_task.py
 app/schemas/common.py
@@ -97,6 +105,7 @@ app/schemas/__pycache__/model.cpython-310.pyc
 app/schemas/__pycache__/model_test.cpython-310.pyc
 app/schemas/__pycache__/sample_center.cpython-310.pyc
 app/schemas/__pycache__/training.cpython-310.pyc
+app/services/
 app/services/api_key_service.py
 app/services/dataset_service.py
 app/services/deploy_service.py
@@ -116,137 +125,120 @@ app/services/__pycache__/model_test_service.cpython-310.pyc
 app/services/__pycache__/sample_center_service.cpython-310.pyc
 app/services/__pycache__/training_service.cpython-310.pyc
 
-sent 8,421 bytes  received 6,982 bytes  832.59 bytes/sec
-total size is 520,877  speedup is 33.82
+sent 10,187 bytes  received 6,962 bytes  926.97 bytes/sec
+total size is 518,960  speedup is 30.26
 => Sync done.
 INFO:     Started server process [1]
 INFO:     Waiting for application startup.
-ERROR:    Traceback (most recent call last):
-  File "/usr/local/lib/python3.10/site-packages/sqlalchemy/dialects/postgresql/asyncpg.py", line 526, in _prepare_and_execute
-    prepared_stmt, attributes = await adapt_connection._prepare(
-  File "/usr/local/lib/python3.10/site-packages/sqlalchemy/dialects/postgresql/asyncpg.py", line 773, in _prepare
-    prepared_stmt = await self._connection.prepare(
-  File "/usr/local/lib/python3.10/site-packages/asyncpg/connection.py", line 638, in prepare
-    return await self._prepare(
-  File "/usr/local/lib/python3.10/site-packages/asyncpg/connection.py", line 657, in _prepare
-    stmt = await self._get_statement(
-  File "/usr/local/lib/python3.10/site-packages/asyncpg/connection.py", line 443, in _get_statement
-    statement = await self._protocol.prepare(
-  File "asyncpg/protocol/protocol.pyx", line 165, in prepare
-asyncpg.exceptions.InFailedSQLTransactionError: current transaction is aborted, commands ignored until end of transaction block
-
-The above exception was the direct cause of the following exception:
-
-Traceback (most recent call last):
-  File "/usr/local/lib/python3.10/site-packages/sqlalchemy/engine/base.py", line 1967, in _exec_single_context
-    self.dialect.do_execute(
-  File "/usr/local/lib/python3.10/site-packages/sqlalchemy/engine/default.py", line 952, in do_execute
-    cursor.execute(statement, parameters)
-  File "/usr/local/lib/python3.10/site-packages/sqlalchemy/dialects/postgresql/asyncpg.py", line 585, in execute
-    self._adapt_connection.await_(
-  File "/usr/local/lib/python3.10/site-packages/sqlalchemy/util/_concurrency_py3k.py", line 132, in await_only
-    return current.parent.switch(awaitable)  # type: ignore[no-any-return,attr-defined] # noqa: E501
-  File "/usr/local/lib/python3.10/site-packages/sqlalchemy/util/_concurrency_py3k.py", line 196, in greenlet_spawn
-    value = await result
-  File "/usr/local/lib/python3.10/site-packages/sqlalchemy/dialects/postgresql/asyncpg.py", line 563, in _prepare_and_execute
-    self._handle_exception(error)
-  File "/usr/local/lib/python3.10/site-packages/sqlalchemy/dialects/postgresql/asyncpg.py", line 513, in _handle_exception
-    self._adapt_connection._handle_exception(error)
-  File "/usr/local/lib/python3.10/site-packages/sqlalchemy/dialects/postgresql/asyncpg.py", line 797, in _handle_exception
-    raise translated_error from error
-sqlalchemy.dialects.postgresql.asyncpg.AsyncAdapt_asyncpg_dbapi.Error: <class 'asyncpg.exceptions.InFailedSQLTransactionError'>: current transaction is aborted, commands ignored until end of transaction block
-
-The above exception was the direct cause of the following exception:
-
-Traceback (most recent call last):
-  File "/usr/local/lib/python3.10/site-packages/starlette/routing.py", line 638, in lifespan
-    async with self.lifespan_context(app) as maybe_state:
-  File "/usr/local/lib/python3.10/contextlib.py", line 199, in __aenter__
-    return await anext(self.gen)
-  File "/usr/local/lib/python3.10/site-packages/fastapi/routing.py", line 216, in merged_lifespan
-    async with original_context(app) as maybe_original_state:
-  File "/usr/local/lib/python3.10/contextlib.py", line 199, in __aenter__
-    return await anext(self.gen)
-  File "/usr/local/lib/python3.10/site-packages/fastapi/routing.py", line 216, in merged_lifespan
-    async with original_context(app) as maybe_original_state:
-  File "/usr/local/lib/python3.10/contextlib.py", line 199, in __aenter__
-    return await anext(self.gen)
-  File "/usr/local/lib/python3.10/site-packages/fastapi/routing.py", line 216, in merged_lifespan
-    async with original_context(app) as maybe_original_state:
-  File "/usr/local/lib/python3.10/contextlib.py", line 199, in __aenter__
-    return await anext(self.gen)
-  File "/usr/local/lib/python3.10/site-packages/fastapi/routing.py", line 216, in merged_lifespan
-    async with original_context(app) as maybe_original_state:
-  File "/usr/local/lib/python3.10/contextlib.py", line 199, in __aenter__
-    return await anext(self.gen)
-  File "/usr/local/lib/python3.10/site-packages/fastapi/routing.py", line 216, in merged_lifespan
-    async with original_context(app) as maybe_original_state:
-  File "/usr/local/lib/python3.10/contextlib.py", line 199, in __aenter__
-    return await anext(self.gen)
-  File "/usr/local/lib/python3.10/site-packages/fastapi/routing.py", line 216, in merged_lifespan
-    async with original_context(app) as maybe_original_state:
-  File "/usr/local/lib/python3.10/contextlib.py", line 199, in __aenter__
-    return await anext(self.gen)
-  File "/usr/local/lib/python3.10/site-packages/fastapi/routing.py", line 216, in merged_lifespan
-    async with original_context(app) as maybe_original_state:
-  File "/usr/local/lib/python3.10/contextlib.py", line 199, in __aenter__
-    return await anext(self.gen)
-  File "/usr/local/lib/python3.10/site-packages/fastapi/routing.py", line 216, in merged_lifespan
-    async with original_context(app) as maybe_original_state:
-  File "/usr/local/lib/python3.10/contextlib.py", line 199, in __aenter__
-    return await anext(self.gen)
-  File "/usr/local/lib/python3.10/site-packages/fastapi/routing.py", line 216, in merged_lifespan
-    async with original_context(app) as maybe_original_state:
-  File "/usr/local/lib/python3.10/contextlib.py", line 199, in __aenter__
-    return await anext(self.gen)
-  File "/usr/local/lib/python3.10/site-packages/fastapi/routing.py", line 216, in merged_lifespan
-    async with original_context(app) as maybe_original_state:
-  File "/usr/local/lib/python3.10/contextlib.py", line 199, in __aenter__
-    return await anext(self.gen)
-  File "/usr/local/lib/python3.10/site-packages/fastapi/routing.py", line 216, in merged_lifespan
-    async with original_context(app) as maybe_original_state:
-  File "/usr/local/lib/python3.10/contextlib.py", line 199, in __aenter__
-    return await anext(self.gen)
-  File "/app/main.py", line 26, in lifespan
-    await init_db()
-  File "/app/app/core/db.py", line 46, in init_db
-    await _migrate_tables()
-  File "/app/app/core/db.py", line 64, in _migrate_tables
-    await conn.execute(text(stmt))
-  File "/usr/local/lib/python3.10/site-packages/sqlalchemy/ext/asyncio/engine.py", line 659, in execute
-    result = await greenlet_spawn(
-  File "/usr/local/lib/python3.10/site-packages/sqlalchemy/util/_concurrency_py3k.py", line 201, in greenlet_spawn
-    result = context.throw(*sys.exc_info())
-  File "/usr/local/lib/python3.10/site-packages/sqlalchemy/engine/base.py", line 1419, in execute
-    return meth(
-  File "/usr/local/lib/python3.10/site-packages/sqlalchemy/sql/elements.py", line 527, in _execute_on_connection
-    return connection._execute_clauseelement(
-  File "/usr/local/lib/python3.10/site-packages/sqlalchemy/engine/base.py", line 1641, in _execute_clauseelement
-    ret = self._execute_context(
-  File "/usr/local/lib/python3.10/site-packages/sqlalchemy/engine/base.py", line 1846, in _execute_context
-    return self._exec_single_context(
-  File "/usr/local/lib/python3.10/site-packages/sqlalchemy/engine/base.py", line 1986, in _exec_single_context
-    self._handle_dbapi_exception(
-  File "/usr/local/lib/python3.10/site-packages/sqlalchemy/engine/base.py", line 2363, in _handle_dbapi_exception
-    raise sqlalchemy_exception.with_traceback(exc_info[2]) from e
-  File "/usr/local/lib/python3.10/site-packages/sqlalchemy/engine/base.py", line 1967, in _exec_single_context
-    self.dialect.do_execute(
-  File "/usr/local/lib/python3.10/site-packages/sqlalchemy/engine/default.py", line 952, in do_execute
-    cursor.execute(statement, parameters)
-  File "/usr/local/lib/python3.10/site-packages/sqlalchemy/dialects/postgresql/asyncpg.py", line 585, in execute
-    self._adapt_connection.await_(
-  File "/usr/local/lib/python3.10/site-packages/sqlalchemy/util/_concurrency_py3k.py", line 132, in await_only
-    return current.parent.switch(awaitable)  # type: ignore[no-any-return,attr-defined] # noqa: E501
-  File "/usr/local/lib/python3.10/site-packages/sqlalchemy/util/_concurrency_py3k.py", line 196, in greenlet_spawn
-    value = await result
-  File "/usr/local/lib/python3.10/site-packages/sqlalchemy/dialects/postgresql/asyncpg.py", line 563, in _prepare_and_execute
-    self._handle_exception(error)
-  File "/usr/local/lib/python3.10/site-packages/sqlalchemy/dialects/postgresql/asyncpg.py", line 513, in _handle_exception
-    self._adapt_connection._handle_exception(error)
-  File "/usr/local/lib/python3.10/site-packages/sqlalchemy/dialects/postgresql/asyncpg.py", line 797, in _handle_exception
-    raise translated_error from error
-sqlalchemy.exc.DBAPIError: (sqlalchemy.dialects.postgresql.asyncpg.Error) <class 'asyncpg.exceptions.InFailedSQLTransactionError'>: current transaction is aborted, commands ignored until end of transaction block
-[SQL: ALTER TABLE deploy_tasks ADD COLUMN endpoint_url VARCHAR(256)]
-(Background on this error at: https://sqlalche.me/e/20/dbapi)
-
-ERROR:    Application startup failed. Exiting.
+2026-05-26 01:48:14 | INFO     | peft-platform | JobQueue started with 2 workers
+2026-05-26 01:48:14 | INFO     | peft-platform | Recovered 1 stale deploy tasks
+INFO:     Application startup complete.
+INFO:     Uvicorn running on http://0.0.0.0:8010 (Press CTRL+C to quit)
+INFO:     127.0.0.1:38956 - "GET /health HTTP/1.1" 200 OK
+INFO:     172.20.0.4:58486 - "GET /api/v1/models/ HTTP/1.0" 401 Unauthorized
+INFO:     172.20.0.4:58488 - "POST /api/v1/auth/refresh HTTP/1.0" 200 OK
+INFO:     172.20.0.4:58504 - "GET /api/v1/models/ HTTP/1.0" 200 OK
+INFO:     172.20.0.4:58512 - "GET /api/v1/models/ HTTP/1.0" 200 OK
+INFO:     172.20.0.4:58522 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:58518 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
+INFO:     172.20.0.4:58524 - "GET /api/v1/models/ HTTP/1.0" 200 OK
+INFO:     172.20.0.4:58530 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
+INFO:     172.20.0.4:58534 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:60598 - "GET /api/v1/models/ HTTP/1.0" 200 OK
+INFO:     172.20.0.4:60616 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:60612 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
+INFO:     172.20.0.4:60624 - "GET /api/v1/models/ HTTP/1.0" 200 OK
+INFO:     172.20.0.4:60630 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
+INFO:     172.20.0.4:60632 - "GET /api/v1/models/ HTTP/1.0" 200 OK
+INFO:     172.20.0.4:60656 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
+INFO:     172.20.0.4:60640 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:60658 - "GET /api/v1/inference/adapters HTTP/1.0" 200 OK
+INFO:     172.20.0.4:60682 - "GET /api/v1/api-keys/ HTTP/1.0" 200 OK
+INFO:     172.20.0.4:60674 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:60696 - "GET /api/v1/deployment/services HTTP/1.0" 200 OK
+INFO:     172.20.0.4:60708 - "GET /api/v1/deployment/services HTTP/1.0" 200 OK
+INFO:     172.20.0.4:48096 - "GET /api/v1/deployment/services HTTP/1.0" 200 OK
+2026-05-26 01:48:37 | INFO     | peft-platform | Serve task started: job=3819e7af-6c9b-4fde-88d0-35784e6afeda port=8100 (task_id=589e0e7b-ff1f-4c15-aed9-9eb562718242)
+INFO:     172.20.0.4:48102 - "POST /api/v1/deployment/serve HTTP/1.0" 200 OK
+2026-05-26 01:50:37 | INFO     | peft-platform | Remote worker launched: task=589e0e7b-ff1f-4c15-aed9-9eb562718242 port=8100 pid=92043
+INFO:     127.0.0.1:34844 - "GET /health HTTP/1.1" 200 OK
+INFO:     127.0.0.1:51118 - "GET /health HTTP/1.1" 200 OK
+INFO:     127.0.0.1:58876 - "GET /health HTTP/1.1" 200 OK
+INFO:     172.20.0.4:48112 - "GET /api/v1/deployment/589e0e7b-ff1f-4c15-aed9-9eb562718242/status HTTP/1.0" 200 OK
+INFO:     172.20.0.4:44574 - "GET /api/v1/deployment/services HTTP/1.0" 200 OK
+INFO:     172.20.0.4:38862 - "GET /api/v1/deployment/589e0e7b-ff1f-4c15-aed9-9eb562718242/status HTTP/1.0" 200 OK
+INFO:     172.20.0.4:35560 - "GET /api/v1/deployment/589e0e7b-ff1f-4c15-aed9-9eb562718242/status HTTP/1.0" 200 OK
+INFO:     172.20.0.4:35568 - "GET /api/v1/deployment/589e0e7b-ff1f-4c15-aed9-9eb562718242/status HTTP/1.0" 200 OK
+INFO:     172.20.0.4:35580 - "GET /api/v1/deployment/589e0e7b-ff1f-4c15-aed9-9eb562718242/status HTTP/1.0" 200 OK
+INFO:     172.20.0.4:40030 - "GET /api/v1/deployment/services HTTP/1.0" 200 OK
+INFO:     172.20.0.4:40050 - "GET /api/v1/deployment/589e0e7b-ff1f-4c15-aed9-9eb562718242/status HTTP/1.0" 200 OK
+INFO:     172.20.0.4:40036 - "GET /api/v1/deployment/589e0e7b-ff1f-4c15-aed9-9eb562718242/status HTTP/1.0" 200 OK
+INFO:     172.20.0.4:40058 - "GET /api/v1/deployment/services HTTP/1.0" 200 OK
+INFO:     172.20.0.4:40060 - "GET /api/v1/deployment/589e0e7b-ff1f-4c15-aed9-9eb562718242/status HTTP/1.0" 200 OK
+INFO:     172.20.0.4:40094 - "GET /api/v1/deployment/589e0e7b-ff1f-4c15-aed9-9eb562718242/status HTTP/1.0" 200 OK
+INFO:     172.20.0.4:40100 - "GET /api/v1/deployment/589e0e7b-ff1f-4c15-aed9-9eb562718242/status HTTP/1.0" 200 OK
+INFO:     172.20.0.4:40106 - "GET /api/v1/deployment/589e0e7b-ff1f-4c15-aed9-9eb562718242/status HTTP/1.0" 200 OK
+INFO:     172.20.0.4:40080 - "GET /api/v1/deployment/589e0e7b-ff1f-4c15-aed9-9eb562718242/status HTTP/1.0" 200 OK
+INFO:     172.20.0.4:40120 - "GET /api/v1/deployment/services HTTP/1.0" 200 OK
+INFO:     172.20.0.4:40064 - "GET /api/v1/deployment/589e0e7b-ff1f-4c15-aed9-9eb562718242/status HTTP/1.0" 200 OK
+INFO:     172.20.0.4:40122 - "GET /api/v1/deployment/589e0e7b-ff1f-4c15-aed9-9eb562718242/status HTTP/1.0" 200 OK
+INFO:     172.20.0.4:40132 - "GET /api/v1/deployment/589e0e7b-ff1f-4c15-aed9-9eb562718242/status HTTP/1.0" 200 OK
+INFO:     172.20.0.4:40134 - "GET /api/v1/deployment/589e0e7b-ff1f-4c15-aed9-9eb562718242/status HTTP/1.0" 200 OK
+INFO:     172.20.0.4:40154 - "GET /api/v1/deployment/589e0e7b-ff1f-4c15-aed9-9eb562718242/status HTTP/1.0" 200 OK
+INFO:     172.20.0.4:40144 - "GET /api/v1/deployment/services HTTP/1.0" 200 OK
+INFO:     172.20.0.4:40166 - "GET /api/v1/deployment/589e0e7b-ff1f-4c15-aed9-9eb562718242/status HTTP/1.0" 200 OK
+INFO:     172.20.0.4:40168 - "GET /api/v1/deployment/589e0e7b-ff1f-4c15-aed9-9eb562718242/status HTTP/1.0" 200 OK
+INFO:     172.20.0.4:40180 - "GET /api/v1/deployment/services HTTP/1.0" 200 OK
+INFO:     172.20.0.4:40192 - "GET /api/v1/deployment/589e0e7b-ff1f-4c15-aed9-9eb562718242/status HTTP/1.0" 200 OK
+INFO:     172.20.0.4:40206 - "GET /api/v1/deployment/589e0e7b-ff1f-4c15-aed9-9eb562718242/status HTTP/1.0" 200 OK
+INFO:     172.20.0.4:40212 - "GET /api/v1/deployment/589e0e7b-ff1f-4c15-aed9-9eb562718242/status HTTP/1.0" 200 OK
+INFO:     172.20.0.4:40218 - "GET /api/v1/deployment/services HTTP/1.0" 200 OK
+INFO:     172.20.0.4:40232 - "GET /api/v1/deployment/589e0e7b-ff1f-4c15-aed9-9eb562718242/status HTTP/1.0" 200 OK
+INFO:     172.20.0.4:40216 - "GET /api/v1/deployment/589e0e7b-ff1f-4c15-aed9-9eb562718242/status HTTP/1.0" 200 OK
+INFO:     172.20.0.4:40254 - "GET /api/v1/deployment/589e0e7b-ff1f-4c15-aed9-9eb562718242/status HTTP/1.0" 200 OK
+INFO:     172.20.0.4:40238 - "GET /api/v1/deployment/589e0e7b-ff1f-4c15-aed9-9eb562718242/status HTTP/1.0" 200 OK
+INFO:     172.20.0.4:40236 - "GET /api/v1/deployment/services HTTP/1.0" 200 OK
+INFO:     172.20.0.4:40272 - "GET /api/v1/deployment/589e0e7b-ff1f-4c15-aed9-9eb562718242/status HTTP/1.0" 200 OK
+INFO:     172.20.0.4:40270 - "GET /api/v1/deployment/589e0e7b-ff1f-4c15-aed9-9eb562718242/status HTTP/1.0" 200 OK
+INFO:     172.20.0.4:40286 - "GET /api/v1/deployment/589e0e7b-ff1f-4c15-aed9-9eb562718242/status HTTP/1.0" 200 OK
+INFO:     172.20.0.4:40296 - "GET /api/v1/deployment/services HTTP/1.0" 200 OK
+INFO:     172.20.0.4:40298 - "GET /api/v1/deployment/589e0e7b-ff1f-4c15-aed9-9eb562718242/status HTTP/1.0" 200 OK
+INFO:     172.20.0.4:40314 - "GET /api/v1/deployment/589e0e7b-ff1f-4c15-aed9-9eb562718242/status HTTP/1.0" 200 OK
+INFO:     172.20.0.4:40320 - "GET /api/v1/deployment/589e0e7b-ff1f-4c15-aed9-9eb562718242/status HTTP/1.0" 200 OK
+INFO:     172.20.0.4:40322 - "GET /api/v1/deployment/589e0e7b-ff1f-4c15-aed9-9eb562718242/status HTTP/1.0" 200 OK
+INFO:     172.20.0.4:40330 - "GET /api/v1/deployment/services HTTP/1.0" 200 OK
+INFO:     172.20.0.4:40334 - "GET /api/v1/deployment/589e0e7b-ff1f-4c15-aed9-9eb562718242/status HTTP/1.0" 200 OK
+INFO:     172.20.0.4:40346 - "GET /api/v1/deployment/services HTTP/1.0" 200 OK
+INFO:     172.20.0.4:40358 - "GET /api/v1/deployment/589e0e7b-ff1f-4c15-aed9-9eb562718242/status HTTP/1.0" 200 OK
+INFO:     172.20.0.4:40372 - "GET /api/v1/deployment/services HTTP/1.0" 200 OK
+INFO:     172.20.0.4:40380 - "GET /api/v1/deployment/589e0e7b-ff1f-4c15-aed9-9eb562718242/status HTTP/1.0" 200 OK
+INFO:     172.20.0.4:40394 - "GET /api/v1/deployment/589e0e7b-ff1f-4c15-aed9-9eb562718242/status HTTP/1.0" 200 OK
+INFO:     172.20.0.4:40406 - "GET /api/v1/deployment/589e0e7b-ff1f-4c15-aed9-9eb562718242/status HTTP/1.0" 200 OK
+INFO:     172.20.0.4:40414 - "GET /api/v1/deployment/589e0e7b-ff1f-4c15-aed9-9eb562718242/status HTTP/1.0" 200 OK
+INFO:     172.20.0.4:40430 - "GET /api/v1/deployment/589e0e7b-ff1f-4c15-aed9-9eb562718242/status HTTP/1.0" 200 OK
+INFO:     172.20.0.4:40438 - "GET /api/v1/deployment/589e0e7b-ff1f-4c15-aed9-9eb562718242/status HTTP/1.0" 200 OK
+2026-05-26 01:51:00 | INFO     | peft-platform | Worker ready: task=589e0e7b-ff1f-4c15-aed9-9eb562718242 (after ~5s)
+INFO:     127.0.0.1:55970 - "GET /health HTTP/1.1" 200 OK
+INFO:     172.20.0.4:40448 - "GET /api/v1/deployment/589e0e7b-ff1f-4c15-aed9-9eb562718242/status HTTP/1.0" 200 OK
+INFO:     172.20.0.4:35594 - "GET /api/v1/deployment/services HTTP/1.0" 200 OK
+INFO:     172.20.0.4:36428 - "GET /api/v1/deployment/589e0e7b-ff1f-4c15-aed9-9eb562718242/status HTTP/1.0" 200 OK
+INFO:     172.20.0.4:45976 - "GET /api/v1/deployment/589e0e7b-ff1f-4c15-aed9-9eb562718242/status HTTP/1.0" 200 OK
+INFO:     172.20.0.4:43664 - "GET /api/v1/deployment/589e0e7b-ff1f-4c15-aed9-9eb562718242/status HTTP/1.0" 200 OK
+INFO:     172.20.0.4:43670 - "GET /api/v1/deployment/589e0e7b-ff1f-4c15-aed9-9eb562718242/status HTTP/1.0" 200 OK
+INFO:     127.0.0.1:34970 - "GET /health HTTP/1.1" 200 OK
+INFO:     172.20.0.4:45990 - "GET /api/v1/deployment/services HTTP/1.0" 200 OK
+INFO:     172.20.0.4:46010 - "GET /api/v1/deployment/589e0e7b-ff1f-4c15-aed9-9eb562718242/status HTTP/1.0" 200 OK
+INFO:     172.20.0.4:46004 - "GET /api/v1/deployment/services HTTP/1.0" 200 OK
+INFO:     172.20.0.4:33412 - "GET /api/v1/deployment/services HTTP/1.0" 200 OK
+INFO:     172.20.0.4:54884 - "GET /api/v1/deployment/services HTTP/1.0" 200 OK
+INFO:     172.20.0.4:54886 - "GET /api/v1/deployment/services HTTP/1.0" 200 OK
+INFO:     172.20.0.4:54896 - "GET /api/v1/deployment/services HTTP/1.0" 200 OK
+INFO:     172.20.0.4:54908 - "GET /api/v1/deployment/services HTTP/1.0" 200 OK
+INFO:     172.20.0.4:54918 - "GET /api/v1/deployment/services HTTP/1.0" 200 OK
+INFO:     172.20.0.4:54928 - "GET /api/v1/deployment/services HTTP/1.0" 200 OK
+INFO:     172.20.0.4:54940 - "GET /api/v1/deployment/services HTTP/1.0" 200 OK
+INFO:     172.20.0.4:34010 - "GET /api/v1/deployment/services HTTP/1.0" 200 OK
+INFO:     172.20.0.4:58916 - "GET /api/v1/deployment/services HTTP/1.0" 200 OK
+INFO:     127.0.0.1:38650 - "GET /health HTTP/1.1" 200 OK
+INFO:     172.20.0.4:37086 - "GET /api/v1/deployment/services HTTP/1.0" 200 OK
+INFO:     172.20.0.4:37088 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK