فهرست منبع

解决阻塞问题

lxylxy123321 1 هفته پیش
والد
کامیت
4ab9c09234
2 فایلهای تغییر یافته به همراه 373 افزوده شده و 58 حذف شده
  1. 9 10
      backend/app/core/job_queue.py
  2. 364 48
      result.txt

+ 9 - 10
backend/app/core/job_queue.py

@@ -276,10 +276,10 @@ class JobQueue:
             return text_engine
 
     async def _poll_remote_progress(self, job_id: str, pid: str):
-        """通过 SSH 读取远程日志文件,解析训练进度。"""
+        """通过 SSH 读取远程日志文件,解析训练进度(非阻塞)。"""
         from app.config import get_settings
         from app.core.websocket import send_progress, send_epoch_done, send_completed, send_error
-        from app.core.remote_executor import ssh_exec
+        from app.core.remote_executor import ssh_exec, is_process_running
 
         settings = get_settings()
         remote_log = f"{settings.compute_node_remote_data_dir}/logs/{job_id}.jsonl"
@@ -290,19 +290,18 @@ class JobQueue:
         for _ in range(max_polls):
             if self.is_cancelled(job_id):
                 _s = get_settings()
-                ssh_exec(f"docker exec {_s.compute_node_docker_container} bash -c 'kill {pid} 2>/dev/null'", timeout=10)
+                await asyncio.to_thread(ssh_exec, f"docker exec {_s.compute_node_docker_container} bash -c 'kill {pid} 2>/dev/null'", timeout=10)
                 self.update_job(job_id, status=JobStatus.CANCELLED)
                 await self._notify_callbacks()
                 await send_error(job_id, "Training cancelled")
                 return
 
-            # 检查进程是否还在运行
-            from app.core.remote_executor import is_process_running
-            process_alive = is_process_running(pid)
+            # 检查进程是否还在运行(非阻塞)
+            process_alive = await asyncio.to_thread(is_process_running, pid)
 
-            # 通过 SSH 远程读取日志文件(追加部分
+            # 通过 SSH 远程读取日志文件(非阻塞)
             cat_cmd = f"docker exec {settings.compute_node_docker_container} bash -c 'wc -c < {remote_log} 2>/dev/null || echo 0'"
-            code, size_out, _ = ssh_exec(cat_cmd, timeout=30)
+            code, size_out, _ = await asyncio.to_thread(ssh_exec, cat_cmd, timeout=30)
             try:
                 file_size = int(size_out.strip()) if code == 0 and size_out.strip() else 0
             except ValueError:
@@ -310,7 +309,7 @@ class JobQueue:
 
             if file_size > last_bytes:
                 read_cmd = f"docker exec {settings.compute_node_docker_container} bash -c 'tail -c +{last_bytes + 1} {remote_log} 2>/dev/null'"
-                code, log_content, _ = ssh_exec(read_cmd, timeout=30)
+                code, log_content, _ = await asyncio.to_thread(ssh_exec, read_cmd, timeout=30)
 
                 if code == 0 and log_content.strip():
                     for line in log_content.strip().split("\n"):
@@ -363,7 +362,7 @@ class JobQueue:
             # 进程已退出但日志里没有 completed/error
             if not process_alive:
                 await asyncio.sleep(2)
-                if not is_process_running(pid):
+                if not await asyncio.to_thread(is_process_running, pid):
                     self.update_job(job_id,
                                     status=JobStatus.FAILED,
                                     error_message=f"Remote process exited unexpectedly (pid={pid})")

+ 364 - 48
result.txt

@@ -1,48 +1,364 @@
-(base) [root@localhost ~]# docker exec -w /root/Fine-tuning/backend finetune-trainer /opt/conda/bin/python -m app.engines.remote_train cce886de-4dd5-460a-b0ac-2404731cd9f8 Qwen/Qwen3.5-0.8B text yanalong/yanalong /root/Fine-tuning/backend/data/config_cce886de-4dd5-460a-b0ac-2404731cd9f8.json
-Traceback (most recent call last):
-  File "/opt/conda/lib/python3.10/site-packages/pydantic_settings/sources/base.py", line 551, in __call__
-    field_value = self.prepare_field_value(field_name, field, field_value, value_is_complex)
-  File "/opt/conda/lib/python3.10/site-packages/pydantic_settings/sources/providers/env.py", line 134, in prepare_field_value
-    raise e
-  File "/opt/conda/lib/python3.10/site-packages/pydantic_settings/sources/providers/env.py", line 131, in prepare_field_value
-    value = self.decode_complex_value(field_name, field, value)
-  File "/opt/conda/lib/python3.10/site-packages/pydantic_settings/sources/base.py", line 194, in decode_complex_value
-    return json.loads(value)
-  File "/opt/conda/lib/python3.10/json/__init__.py", line 346, in loads
-    return _default_decoder.decode(s)
-  File "/opt/conda/lib/python3.10/json/decoder.py", line 337, in decode
-    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
-  File "/opt/conda/lib/python3.10/json/decoder.py", line 355, in raw_decode
-    raise JSONDecodeError("Expecting value", s, err.value) from None
-json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
-
-The above exception was the direct cause of the following exception:
-
-Traceback (most recent call last):
-  File "/opt/conda/lib/python3.10/runpy.py", line 196, in _run_module_as_main
-    return _run_code(code, main_globals, None,
-  File "/opt/conda/lib/python3.10/runpy.py", line 86, in _run_code
-    exec(code, run_globals)
-  File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 179, in <module>
-    main()
-  File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 175, in main
-    asyncio.run(run_training(job_id, model_id, model_type, dataset_id, config))
-  File "/opt/conda/lib/python3.10/asyncio/runners.py", line 44, in run
-    return loop.run_until_complete(main)
-  File "/opt/conda/lib/python3.10/asyncio/base_events.py", line 649, in run_until_complete
-    return future.result()
-  File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 78, in run_training
-    from app.core.logging import logger
-  File "/root/Fine-tuning/backend/app/core/logging.py", line 5, in <module>
-    settings = get_settings()
-  File "/root/Fine-tuning/backend/app/config.py", line 141, in get_settings
-    settings = Settings()
-  File "/opt/conda/lib/python3.10/site-packages/pydantic_settings/main.py", line 247, in __init__
-    super().__init__(**__pydantic_self__.__class__._settings_build_values(sources, init_kwargs))
-  File "/opt/conda/lib/python3.10/site-packages/pydantic_settings/main.py", line 470, in _settings_build_values
-    source_state = source()
-  File "/opt/conda/lib/python3.10/site-packages/pydantic_settings/sources/providers/dotenv.py", line 112, in __call__
-    data: dict[str, Any] = super().__call__()
-  File "/opt/conda/lib/python3.10/site-packages/pydantic_settings/sources/base.py", line 553, in __call__
-    raise SettingsError(
-pydantic_settings.exceptions.SettingsError: error parsing value for field "backend_cors_origins" from source "DotEnvSettingsSource"
+finetune-backend  | 2026-05-20 05:13:34 | INFO     | peft-platform | Remote training launched in container: job=a52d395e-d3c8-40d2-9be3-1839f597dc7f, container_pid=12699
+finetune-backend  | INFO:     127.0.0.1:59032 - "GET /health HTTP/1.1" 200 OK
+finetune-backend  | INFO:     172.20.0.4:56196 - "GET /api/v1/training/jobs HTTP/1.0" 500 Internal Server Error
+finetune-backend  | ERROR:    Exception in ASGI application
+finetune-backend  | Traceback (most recent call last):
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/asyncpg/connection.py", line 2443, in connect
+finetune-backend  |     return await connect_utils._connect(
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/asyncpg/connect_utils.py", line 1218, in _connect
+finetune-backend  |     conn = await _connect_addr(
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/asyncpg/connect_utils.py", line 1054, in _connect_addr
+finetune-backend  |     return await __connect_addr(params, True, *args)
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/asyncpg/connect_utils.py", line 1102, in __connect_addr
+finetune-backend  |     await connected
+finetune-backend  | asyncio.exceptions.CancelledError
+finetune-backend  | 
+finetune-backend  | During handling of the above exception, another exception occurred:
+finetune-backend  | 
+finetune-backend  | Traceback (most recent call last):
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/uvicorn/protocols/http/httptools_impl.py", line 421, in run_asgi
+finetune-backend  |     result = await app(  # type: ignore[func-returns-value]
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/uvicorn/middleware/proxy_headers.py", line 56, in __call__
+finetune-backend  |     return await self.app(scope, receive, send)
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/fastapi/applications.py", line 1159, in __call__
+finetune-backend  |     await super().__call__(scope, receive, send)
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/starlette/applications.py", line 90, in __call__
+finetune-backend  |     await self.middleware_stack(scope, receive, send)
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/starlette/middleware/errors.py", line 186, in __call__
+finetune-backend  |     raise exc
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/starlette/middleware/errors.py", line 164, in __call__
+finetune-backend  |     await self.app(scope, receive, _send)
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/starlette/middleware/cors.py", line 88, in __call__
+finetune-backend  |     await self.app(scope, receive, send)
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/starlette/middleware/exceptions.py", line 63, in __call__
+finetune-backend  |     await wrap_app_handling_exceptions(self.app, conn)(scope, receive, send)
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/starlette/_exception_handler.py", line 53, in wrapped_app
+finetune-backend  |     raise exc
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/starlette/_exception_handler.py", line 42, in wrapped_app
+finetune-backend  |     await app(scope, receive, sender)
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/fastapi/middleware/asyncexitstack.py", line 18, in __call__
+finetune-backend  |     await self.app(scope, receive, send)
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/starlette/routing.py", line 660, in __call__
+finetune-backend  |     await self.middleware_stack(scope, receive, send)
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/starlette/routing.py", line 680, in app
+finetune-backend  |     await route.handle(scope, receive, send)
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/starlette/routing.py", line 276, in handle
+finetune-backend  |     await self.app(scope, receive, send)
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/fastapi/routing.py", line 134, in app
+finetune-backend  |     await wrap_app_handling_exceptions(app, request)(scope, receive, send)
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/starlette/_exception_handler.py", line 53, in wrapped_app
+finetune-backend  |     raise exc
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/starlette/_exception_handler.py", line 42, in wrapped_app
+finetune-backend  |     await app(scope, receive, sender)
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/fastapi/routing.py", line 120, in app
+finetune-backend  |     response = await f(request)
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/fastapi/routing.py", line 674, in app
+finetune-backend  |     raw_response = await run_endpoint_function(
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/fastapi/routing.py", line 328, in run_endpoint_function
+finetune-backend  |     return await dependant.call(**values)
+finetune-backend  |   File "/app/app/api/training.py", line 20, in list_training_jobs
+finetune-backend  |     items = await training_service.list_training_jobs()
+finetune-backend  |   File "/app/app/services/training_service.py", line 87, in list_training_jobs
+finetune-backend  |     result = await session.execute(select(TrainingJobModel).order_by(TrainingJobModel.created_at.desc()))
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/ext/asyncio/session.py", line 449, in execute
+finetune-backend  |     result = await greenlet_spawn(
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/util/_concurrency_py3k.py", line 201, in greenlet_spawn
+finetune-backend  |     result = context.throw(*sys.exc_info())
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/orm/session.py", line 2351, in execute
+finetune-backend  |     return self._execute_internal(
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/orm/session.py", line 2239, in _execute_internal
+finetune-backend  |     conn = self._connection_for_bind(bind)
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/orm/session.py", line 2108, in _connection_for_bind
+finetune-backend  |     return trans._connection_for_bind(engine, execution_options)
+finetune-backend  |   File "<string>", line 2, in _connection_for_bind
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/orm/state_changes.py", line 137, in _go
+finetune-backend  |     ret_value = fn(self, *arg, **kw)
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/orm/session.py", line 1187, in _connection_for_bind
+finetune-backend  |     conn = bind.connect()
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/engine/base.py", line 3293, in connect
+finetune-backend  |     return self._connection_cls(self)
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/engine/base.py", line 143, in __init__
+finetune-backend  |     self._dbapi_connection = engine.raw_connection()
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/engine/base.py", line 3317, in raw_connection
+finetune-backend  |     return self.pool.connect()
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/pool/base.py", line 448, in connect
+finetune-backend  |     return _ConnectionFairy._checkout(self)
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/pool/base.py", line 1272, in _checkout
+finetune-backend  |     fairy = _ConnectionRecord.checkout(pool)
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/pool/base.py", line 712, in checkout
+finetune-backend  |     rec = pool._do_get()
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/pool/impl.py", line 177, in _do_get
+finetune-backend  |     with util.safe_reraise():
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/util/langhelpers.py", line 121, in __exit__
+finetune-backend  |     raise exc_value.with_traceback(exc_tb)
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/pool/impl.py", line 175, in _do_get
+finetune-backend  |     return self._create_connection()
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/pool/base.py", line 389, in _create_connection
+finetune-backend  |     return _ConnectionRecord(self)
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/pool/base.py", line 674, in __init__
+finetune-backend  | INFO:     172.20.0.4:56212 - "GET /api/v1/models/ HTTP/1.0" 500 Internal Server Error
+finetune-backend  |     self.__connect()
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/pool/base.py", line 900, in __connect
+finetune-backend  |     with util.safe_reraise():
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/util/langhelpers.py", line 121, in __exit__
+finetune-backend  |     raise exc_value.with_traceback(exc_tb)
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/pool/base.py", line 896, in __connect
+finetune-backend  |     self.dbapi_connection = connection = pool._invoke_creator(self)
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/engine/create.py", line 667, in connect
+finetune-backend  |     return dialect.connect(*cargs_tup, **cparams)
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/engine/default.py", line 630, in connect
+finetune-backend  |     return self.loaded_dbapi.connect(*cargs, **cparams)  # type: ignore[no-any-return]  # NOQA: E501
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/dialects/postgresql/asyncpg.py", line 955, in connect
+finetune-backend  |     await_only(creator_fn(*arg, **kw)),
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/util/_concurrency_py3k.py", line 132, in await_only
+finetune-backend  |     return current.parent.switch(awaitable)  # type: ignore[no-any-return,attr-defined] # noqa: E501
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/util/_concurrency_py3k.py", line 196, in greenlet_spawn
+finetune-backend  |     value = await result
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/asyncpg/connection.py", line 2442, in connect
+finetune-backend  |     async with compat.timeout(timeout):
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/async_timeout/__init__.py", line 179, in __aexit__
+finetune-backend  |     self._do_exit(exc_type)
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/async_timeout/__init__.py", line 265, in _do_exit
+finetune-backend  |     raise asyncio.TimeoutError
+finetune-backend  | asyncio.exceptions.TimeoutError
+finetune-backend  | ERROR:    Exception in ASGI application
+finetune-backend  | Traceback (most recent call last):
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/asyncpg/connection.py", line 2443, in connect
+finetune-backend  |     return await connect_utils._connect(
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/asyncpg/connect_utils.py", line 1218, in _connect
+finetune-backend  |     conn = await _connect_addr(
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/asyncpg/connect_utils.py", line 1054, in _connect_addr
+finetune-backend  |     return await __connect_addr(params, True, *args)
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/asyncpg/connect_utils.py", line 1102, in __connect_addr
+finetune-backend  |     await connected
+finetune-backend  | asyncio.exceptions.CancelledError
+finetune-backend  | 
+finetune-backend  | During handling of the above exception, another exception occurred:
+finetune-backend  | 
+finetune-backend  | Traceback (most recent call last):
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/uvicorn/protocols/http/httptools_impl.py", line 421, in run_asgi
+finetune-backend  |     result = await app(  # type: ignore[func-returns-value]
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/uvicorn/middleware/proxy_headers.py", line 56, in __call__
+finetune-backend  |     return await self.app(scope, receive, send)
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/fastapi/applications.py", line 1159, in __call__
+finetune-backend  |     await super().__call__(scope, receive, send)
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/starlette/applications.py", line 90, in __call__
+finetune-backend  |     await self.middleware_stack(scope, receive, send)
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/starlette/middleware/errors.py", line 186, in __call__
+finetune-backend  |     raise exc
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/starlette/middleware/errors.py", line 164, in __call__
+finetune-backend  |     await self.app(scope, receive, _send)
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/starlette/middleware/cors.py", line 88, in __call__
+finetune-backend  |     await self.app(scope, receive, send)
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/starlette/middleware/exceptions.py", line 63, in __call__
+finetune-backend  |     await wrap_app_handling_exceptions(self.app, conn)(scope, receive, send)
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/starlette/_exception_handler.py", line 53, in wrapped_app
+finetune-backend  |     raise exc
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/starlette/_exception_handler.py", line 42, in wrapped_app
+finetune-backend  |     await app(scope, receive, sender)
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/fastapi/middleware/asyncexitstack.py", line 18, in __call__
+finetune-backend  |     await self.app(scope, receive, send)
+finetune-backend  | INFO:     172.20.0.4:56228 - "GET /api/v1/datasets/ HTTP/1.0" 500 Internal Server Error
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/starlette/routing.py", line 660, in __call__
+finetune-backend  |     await self.middleware_stack(scope, receive, send)
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/starlette/routing.py", line 680, in app
+finetune-backend  |     await route.handle(scope, receive, send)
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/starlette/routing.py", line 276, in handle
+finetune-backend  |     await self.app(scope, receive, send)
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/fastapi/routing.py", line 134, in app
+finetune-backend  |     await wrap_app_handling_exceptions(app, request)(scope, receive, send)
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/starlette/_exception_handler.py", line 53, in wrapped_app
+finetune-backend  |     raise exc
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/starlette/_exception_handler.py", line 42, in wrapped_app
+finetune-backend  |     await app(scope, receive, sender)
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/fastapi/routing.py", line 120, in app
+finetune-backend  |     response = await f(request)
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/fastapi/routing.py", line 674, in app
+finetune-backend  |     raw_response = await run_endpoint_function(
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/fastapi/routing.py", line 328, in run_endpoint_function
+finetune-backend  |     return await dependant.call(**values)
+finetune-backend  |   File "/app/app/api/models.py", line 13, in list_models
+finetune-backend  |     models = await model_service.list_cached_models()
+finetune-backend  |   File "/app/app/services/model_service.py", line 123, in list_cached_models
+finetune-backend  |     result = await session.execute(select(ModelCache).order_by(ModelCache.created_at.desc()))
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/ext/asyncio/session.py", line 449, in execute
+finetune-backend  |     result = await greenlet_spawn(
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/util/_concurrency_py3k.py", line 201, in greenlet_spawn
+finetune-backend  |     result = context.throw(*sys.exc_info())
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/orm/session.py", line 2351, in execute
+finetune-backend  |     return self._execute_internal(
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/orm/session.py", line 2239, in _execute_internal
+finetune-backend  |     conn = self._connection_for_bind(bind)
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/orm/session.py", line 2108, in _connection_for_bind
+finetune-backend  |     return trans._connection_for_bind(engine, execution_options)
+finetune-backend  |   File "<string>", line 2, in _connection_for_bind
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/orm/state_changes.py", line 137, in _go
+finetune-backend  |     ret_value = fn(self, *arg, **kw)
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/orm/session.py", line 1187, in _connection_for_bind
+finetune-backend  |     conn = bind.connect()
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/engine/base.py", line 3293, in connect
+finetune-backend  |     return self._connection_cls(self)
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/engine/base.py", line 143, in __init__
+finetune-backend  |     self._dbapi_connection = engine.raw_connection()
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/engine/base.py", line 3317, in raw_connection
+finetune-backend  |     return self.pool.connect()
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/pool/base.py", line 448, in connect
+finetune-backend  |     return _ConnectionFairy._checkout(self)
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/pool/base.py", line 1272, in _checkout
+finetune-backend  |     fairy = _ConnectionRecord.checkout(pool)
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/pool/base.py", line 712, in checkout
+finetune-backend  |     rec = pool._do_get()
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/pool/impl.py", line 177, in _do_get
+finetune-backend  |     with util.safe_reraise():
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/util/langhelpers.py", line 121, in __exit__
+finetune-backend  |     raise exc_value.with_traceback(exc_tb)
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/pool/impl.py", line 175, in _do_get
+finetune-backend  |     return self._create_connection()
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/pool/base.py", line 389, in _create_connection
+finetune-backend  |     return _ConnectionRecord(self)
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/pool/base.py", line 674, in __init__
+finetune-backend  |     self.__connect()
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/pool/base.py", line 900, in __connect
+finetune-backend  |     with util.safe_reraise():
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/util/langhelpers.py", line 121, in __exit__
+finetune-backend  |     raise exc_value.with_traceback(exc_tb)
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/pool/base.py", line 896, in __connect
+finetune-backend  |     self.dbapi_connection = connection = pool._invoke_creator(self)
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/engine/create.py", line 667, in connect
+finetune-backend  |     return dialect.connect(*cargs_tup, **cparams)
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/engine/default.py", line 630, in connect
+finetune-backend  |     return self.loaded_dbapi.connect(*cargs, **cparams)  # type: ignore[no-any-return]  # NOQA: E501
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/dialects/postgresql/asyncpg.py", line 955, in connect
+finetune-backend  |     await_only(creator_fn(*arg, **kw)),
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/util/_concurrency_py3k.py", line 132, in await_only
+finetune-backend  |     return current.parent.switch(awaitable)  # type: ignore[no-any-return,attr-defined] # noqa: E501
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/util/_concurrency_py3k.py", line 196, in greenlet_spawn
+finetune-backend  |     value = await result
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/asyncpg/connection.py", line 2442, in connect
+finetune-backend  |     async with compat.timeout(timeout):
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/async_timeout/__init__.py", line 179, in __aexit__
+finetune-backend  |     self._do_exit(exc_type)
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/async_timeout/__init__.py", line 265, in _do_exit
+finetune-backend  |     raise asyncio.TimeoutError
+finetune-backend  | asyncio.exceptions.TimeoutError
+finetune-backend  | ERROR:    Exception in ASGI application
+finetune-backend  | Traceback (most recent call last):
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/asyncpg/connection.py", line 2443, in connect
+finetune-backend  |     return await connect_utils._connect(
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/asyncpg/connect_utils.py", line 1218, in _connect
+finetune-backend  |     conn = await _connect_addr(
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/asyncpg/connect_utils.py", line 1054, in _connect_addr
+finetune-backend  |     return await __connect_addr(params, True, *args)
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/asyncpg/connect_utils.py", line 1102, in __connect_addr
+finetune-backend  |     await connected
+finetune-backend  | asyncio.exceptions.CancelledError
+finetune-backend  | 
+finetune-backend  | During handling of the above exception, another exception occurred:
+finetune-backend  | 
+finetune-backend  | Traceback (most recent call last):
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/uvicorn/protocols/http/httptools_impl.py", line 421, in run_asgi
+finetune-backend  |     result = await app(  # type: ignore[func-returns-value]
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/uvicorn/middleware/proxy_headers.py", line 56, in __call__
+finetune-backend  |     return await self.app(scope, receive, send)
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/fastapi/applications.py", line 1159, in __call__
+finetune-backend  |     await super().__call__(scope, receive, send)
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/starlette/applications.py", line 90, in __call__
+finetune-backend  |     await self.middleware_stack(scope, receive, send)
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/starlette/middleware/errors.py", line 186, in __call__
+finetune-backend  |     raise exc
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/starlette/middleware/errors.py", line 164, in __call__
+finetune-backend  |     await self.app(scope, receive, _send)
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/starlette/middleware/cors.py", line 88, in __call__
+finetune-backend  |     await self.app(scope, receive, send)
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/starlette/middleware/exceptions.py", line 63, in __call__
+finetune-backend  |     await wrap_app_handling_exceptions(self.app, conn)(scope, receive, send)
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/starlette/_exception_handler.py", line 53, in wrapped_app
+finetune-backend  |     raise exc
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/starlette/_exception_handler.py", line 42, in wrapped_app
+finetune-backend  |     await app(scope, receive, sender)
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/fastapi/middleware/asyncexitstack.py", line 18, in __call__
+finetune-backend  |     await self.app(scope, receive, send)
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/starlette/routing.py", line 660, in __call__
+finetune-backend  |     await self.middleware_stack(scope, receive, send)
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/starlette/routing.py", line 680, in app
+finetune-backend  |     await route.handle(scope, receive, send)
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/starlette/routing.py", line 276, in handle
+finetune-backend  |     await self.app(scope, receive, send)
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/fastapi/routing.py", line 134, in app
+finetune-backend  |     await wrap_app_handling_exceptions(app, request)(scope, receive, send)
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/starlette/_exception_handler.py", line 53, in wrapped_app
+finetune-backend  |     raise exc
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/starlette/_exception_handler.py", line 42, in wrapped_app
+finetune-backend  |     await app(scope, receive, sender)
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/fastapi/routing.py", line 120, in app
+finetune-backend  |     response = await f(request)
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/fastapi/routing.py", line 674, in app
+finetune-backend  |     raw_response = await run_endpoint_function(
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/fastapi/routing.py", line 328, in run_endpoint_function
+finetune-backend  |     return await dependant.call(**values)
+finetune-backend  |   File "/app/app/api/datasets.py", line 48, in list_datasets
+finetune-backend  |     items = await dataset_service.list_datasets()
+finetune-backend  |   File "/app/app/services/dataset_service.py", line 361, in list_datasets
+finetune-backend  |     result = await session.execute(select(DatasetRecord).order_by(DatasetRecord.created_at.desc()))
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/ext/asyncio/session.py", line 449, in execute
+finetune-backend  |     result = await greenlet_spawn(
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/util/_concurrency_py3k.py", line 201, in greenlet_spawn
+finetune-backend  |     result = context.throw(*sys.exc_info())
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/orm/session.py", line 2351, in execute
+finetune-backend  |     return self._execute_internal(
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/orm/session.py", line 2239, in _execute_internal
+finetune-backend  |     conn = self._connection_for_bind(bind)
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/orm/session.py", line 2108, in _connection_for_bind
+finetune-backend  |     return trans._connection_for_bind(engine, execution_options)
+finetune-backend  |   File "<string>", line 2, in _connection_for_bind
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/orm/state_changes.py", line 137, in _go
+finetune-backend  |     ret_value = fn(self, *arg, **kw)
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/orm/session.py", line 1187, in _connection_for_bind
+finetune-backend  |     conn = bind.connect()
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/engine/base.py", line 3293, in connect
+finetune-backend  |     return self._connection_cls(self)
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/engine/base.py", line 143, in __init__
+finetune-backend  |     self._dbapi_connection = engine.raw_connection()
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/engine/base.py", line 3317, in raw_connection
+finetune-backend  |     return self.pool.connect()
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/pool/base.py", line 448, in connect
+finetune-backend  |     return _ConnectionFairy._checkout(self)
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/pool/base.py", line 1272, in _checkout
+finetune-backend  |     fairy = _ConnectionRecord.checkout(pool)
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/pool/base.py", line 712, in checkout
+finetune-backend  |     rec = pool._do_get()
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/pool/impl.py", line 177, in _do_get
+finetune-backend  |     with util.safe_reraise():
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/util/langhelpers.py", line 121, in __exit__
+finetune-backend  |     raise exc_value.with_traceback(exc_tb)
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/pool/impl.py", line 175, in _do_get
+finetune-backend  |     return self._create_connection()
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/pool/base.py", line 389, in _create_connection
+finetune-backend  |     return _ConnectionRecord(self)
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/pool/base.py", line 674, in __init__
+finetune-backend  |     self.__connect()
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/pool/base.py", line 900, in __connect
+finetune-backend  |     with util.safe_reraise():
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/util/langhelpers.py", line 121, in __exit__
+finetune-backend  |     raise exc_value.with_traceback(exc_tb)
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/pool/base.py", line 896, in __connect
+finetune-backend  |     self.dbapi_connection = connection = pool._invoke_creator(self)
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/engine/create.py", line 667, in connect
+finetune-backend  |     return dialect.connect(*cargs_tup, **cparams)
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/engine/default.py", line 630, in connect
+finetune-backend  |     return self.loaded_dbapi.connect(*cargs, **cparams)  # type: ignore[no-any-return]  # NOQA: E501
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/dialects/postgresql/asyncpg.py", line 955, in connect
+finetune-backend  |     await_only(creator_fn(*arg, **kw)),
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/util/_concurrency_py3k.py", line 132, in await_only
+finetune-backend  |     return current.parent.switch(awaitable)  # type: ignore[no-any-return,attr-defined] # noqa: E501
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/util/_concurrency_py3k.py", line 196, in greenlet_spawn
+finetune-backend  |     value = await result
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/asyncpg/connection.py", line 2442, in connect
+finetune-backend  |     async with compat.timeout(timeout):
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/async_timeout/__init__.py", line 179, in __aexit__
+finetune-backend  |     self._do_exit(exc_type)
+finetune-backend  |   File "/usr/local/lib/python3.10/site-packages/async_timeout/__init__.py", line 265, in _do_exit
+finetune-backend  |     raise asyncio.TimeoutError
+finetune-backend  | asyncio.exceptions.TimeoutError
+finetune-backend  | INFO:     127.0.0.1:44850 - "GET /health HTTP/1.1" 200 OK
+finetune-backend  | 2026-05-20 05:14:27 | INFO     | peft-platform | Remote training launched for job a52d395e-d3c8-40d2-9be3-1839f597dc7f