|
|
@@ -1,38 +1,123 @@
|
|
|
-(base) [root@localhost ~]# mx-smi
|
|
|
-mx-smi version: 2.2.9
|
|
|
-
|
|
|
-=================== MetaX System Management Interface Log ===================
|
|
|
-Timestamp : Fri May 22 03:09:03 2026
|
|
|
-
|
|
|
-Attached GPUs : 4
|
|
|
-+---------------------------------------------------------------------------------+
|
|
|
-| MX-SMI 2.2.9 Kernel Mode Driver Version: 3.4.4 |
|
|
|
-| MACA Version: 3.3.0.15 BIOS Version: 1.30.0.0 |
|
|
|
-|------------------+-----------------+---------------------+----------------------|
|
|
|
-| Board Name | GPU Persist-M | Bus-id | GPU-Util sGPU-M |
|
|
|
-| Pwr:Usage/Cap | Temp Perf | Memory-Usage | GPU-State |
|
|
|
-|==================+=================+=====================+======================|
|
|
|
-| 0 MetaX N260 | 0 Off | 0000:b5:00.0 | 0% Disabled |
|
|
|
-| 53W / 225W | 43C P9 | 60459/65536 MiB | Available |
|
|
|
-+------------------+-----------------+---------------------+----------------------+
|
|
|
-| 1 MetaX N260 | 1 Off | 0000:b6:00.0 | 0% Disabled |
|
|
|
-| 50W / 225W | 42C P9 | 60459/65536 MiB | Available |
|
|
|
-+------------------+-----------------+---------------------+----------------------+
|
|
|
-| 2 MetaX N260 | 2 Off | 0000:b9:00.0 | 62% Disabled |
|
|
|
-| 130W / 225W | 64C P9 | 41042/65536 MiB | Available |
|
|
|
-+------------------+-----------------+---------------------+----------------------+
|
|
|
-| 3 MetaX N260 | 3 Off | 0000:bd:00.0 | 60% Disabled |
|
|
|
-| 126W / 225W | 61C P9 | 39916/65536 MiB | Available |
|
|
|
-+------------------+-----------------+---------------------+----------------------+
|
|
|
-
|
|
|
-+---------------------------------------------------------------------------------+
|
|
|
-| Process: |
|
|
|
-| GPU PID Process Name GPU Memory |
|
|
|
-| Usage(MiB) |
|
|
|
-|=================================================================================|
|
|
|
-| 0 1007916 VLLM::Worker_TP 59790 |
|
|
|
-| 1 1007917 VLLM::Worker_TP 59790 |
|
|
|
-| 2 1217897 python 5846 |
|
|
|
-| 2 1229576 python 34528 |
|
|
|
-| 3 1217897 python 5384 |
|
|
|
-| 3 1229576 python 33864
|
|
|
+2026-05-22T08:18:50.643015421Z File "/usr/local/lib/python3.10/site-packages/asyncpg/connection.py", line 638, in prepare
|
|
|
+2026-05-22T08:18:50.643022797Z return await self._prepare(
|
|
|
+2026-05-22T08:18:50.643030223Z File "/usr/local/lib/python3.10/site-packages/asyncpg/connection.py", line 657, in _prepare
|
|
|
+2026-05-22T08:18:50.643037457Z stmt = await self._get_statement(
|
|
|
+2026-05-22T08:18:50.643044657Z File "/usr/local/lib/python3.10/site-packages/asyncpg/connection.py", line 443, in _get_statement
|
|
|
+2026-05-22T08:18:50.643052007Z statement = await self._protocol.prepare(
|
|
|
+2026-05-22T08:18:50.643059081Z File "asyncpg/protocol/protocol.pyx", line 165, in prepare
|
|
|
+2026-05-22T08:18:50.643066273Z asyncpg.exceptions.UndefinedColumnError: column deploy_tasks.progress does not exist
|
|
|
+2026-05-22T08:18:50.643073515Z
|
|
|
+2026-05-22T08:18:50.643080579Z The above exception was the direct cause of the following exception:
|
|
|
+2026-05-22T08:18:50.643087744Z
|
|
|
+2026-05-22T08:18:50.643094663Z Traceback (most recent call last):
|
|
|
+2026-05-22T08:18:50.643101762Z File "/usr/local/lib/python3.10/site-packages/sqlalchemy/engine/base.py", line 1967, in _exec_single_context
|
|
|
+2026-05-22T08:18:50.643109207Z self.dialect.do_execute(
|
|
|
+2026-05-22T08:18:50.643116203Z File "/usr/local/lib/python3.10/site-packages/sqlalchemy/engine/default.py", line 952, in do_execute
|
|
|
+2026-05-22T08:18:50.643123550Z cursor.execute(statement, parameters)
|
|
|
+2026-05-22T08:18:50.643130624Z File "/usr/local/lib/python3.10/site-packages/sqlalchemy/dialects/postgresql/asyncpg.py", line 585, in execute
|
|
|
+2026-05-22T08:18:50.643137909Z self._adapt_connection.await_(
|
|
|
+2026-05-22T08:18:50.643145080Z File "/usr/local/lib/python3.10/site-packages/sqlalchemy/util/_concurrency_py3k.py", line 132, in await_only
|
|
|
+2026-05-22T08:18:50.643152513Z return current.parent.switch(awaitable) # type: ignore[no-any-return,attr-defined] # noqa: E501
|
|
|
+2026-05-22T08:18:50.643191846Z File "/usr/local/lib/python3.10/site-packages/sqlalchemy/util/_concurrency_py3k.py", line 196, in greenlet_spawn
|
|
|
+2026-05-22T08:18:50.643199823Z value = await result
|
|
|
+2026-05-22T08:18:50.643207640Z File "/usr/local/lib/python3.10/site-packages/sqlalchemy/dialects/postgresql/asyncpg.py", line 563, in _prepare_and_execute
|
|
|
+2026-05-22T08:18:50.643215143Z self._handle_exception(error)
|
|
|
+2026-05-22T08:18:50.643227474Z File "/usr/local/lib/python3.10/site-packages/sqlalchemy/dialects/postgresql/asyncpg.py", line 513, in _handle_exception
|
|
|
+2026-05-22T08:18:50.643235388Z self._adapt_connection._handle_exception(error)
|
|
|
+2026-05-22T08:18:50.643242533Z File "/usr/local/lib/python3.10/site-packages/sqlalchemy/dialects/postgresql/asyncpg.py", line 797, in _handle_exception
|
|
|
+2026-05-22T08:18:50.643249926Z raise translated_error from error
|
|
|
+2026-05-22T08:18:50.643257198Z sqlalchemy.dialects.postgresql.asyncpg.AsyncAdapt_asyncpg_dbapi.ProgrammingError: <class 'asyncpg.exceptions.UndefinedColumnError'>: column deploy_tasks.progress does not exist
|
|
|
+2026-05-22T08:18:50.643265144Z
|
|
|
+2026-05-22T08:18:50.643272147Z The above exception was the direct cause of the following exception:
|
|
|
+2026-05-22T08:18:50.643279941Z
|
|
|
+2026-05-22T08:18:50.643286963Z Traceback (most recent call last):
|
|
|
+2026-05-22T08:18:50.643294066Z File "/usr/local/lib/python3.10/site-packages/starlette/routing.py", line 638, in lifespan
|
|
|
+2026-05-22T08:18:50.643301414Z async with self.lifespan_context(app) as maybe_state:
|
|
|
+2026-05-22T08:18:50.643308589Z File "/usr/local/lib/python3.10/contextlib.py", line 199, in __aenter__
|
|
|
+2026-05-22T08:18:50.643315873Z return await anext(self.gen)
|
|
|
+2026-05-22T08:18:50.643322977Z File "/usr/local/lib/python3.10/site-packages/fastapi/routing.py", line 216, in merged_lifespan
|
|
|
+2026-05-22T08:18:50.643330348Z async with original_context(app) as maybe_original_state:
|
|
|
+2026-05-22T08:18:50.643337567Z File "/usr/local/lib/python3.10/contextlib.py", line 199, in __aenter__
|
|
|
+2026-05-22T08:18:50.643344825Z return await anext(self.gen)
|
|
|
+2026-05-22T08:18:50.643351927Z File "/usr/local/lib/python3.10/site-packages/fastapi/routing.py", line 216, in merged_lifespan
|
|
|
+2026-05-22T08:18:50.643434023Z async with original_context(app) as maybe_original_state:
|
|
|
+2026-05-22T08:18:50.643471056Z File "/usr/local/lib/python3.10/contextlib.py", line 199, in __aenter__
|
|
|
+2026-05-22T08:18:50.643488241Z return await anext(self.gen)
|
|
|
+2026-05-22T08:18:50.643502853Z File "/usr/local/lib/python3.10/site-packages/fastapi/routing.py", line 216, in merged_lifespan
|
|
|
+2026-05-22T08:18:50.643515442Z async with original_context(app) as maybe_original_state:
|
|
|
+2026-05-22T08:18:50.643531260Z File "/usr/local/lib/python3.10/contextlib.py", line 199, in __aenter__
|
|
|
+2026-05-22T08:18:50.643566372Z return await anext(self.gen)
|
|
|
+2026-05-22T08:18:50.643581558Z File "/usr/local/lib/python3.10/site-packages/fastapi/routing.py", line 216, in merged_lifespan
|
|
|
+2026-05-22T08:18:50.643599397Z async with original_context(app) as maybe_original_state:
|
|
|
+2026-05-22T08:18:50.643611772Z File "/usr/local/lib/python3.10/contextlib.py", line 199, in __aenter__
|
|
|
+2026-05-22T08:18:50.643623462Z return await anext(self.gen)
|
|
|
+2026-05-22T08:18:50.643634678Z File "/usr/local/lib/python3.10/site-packages/fastapi/routing.py", line 216, in merged_lifespan
|
|
|
+2026-05-22T08:18:50.643648989Z async with original_context(app) as maybe_original_state:
|
|
|
+2026-05-22T08:18:50.643660765Z File "/usr/local/lib/python3.10/contextlib.py", line 199, in __aenter__
|
|
|
+2026-05-22T08:18:50.643673291Z return await anext(self.gen)
|
|
|
+2026-05-22T08:18:50.643688183Z File "/usr/local/lib/python3.10/site-packages/fastapi/routing.py", line 216, in merged_lifespan
|
|
|
+2026-05-22T08:18:50.643701236Z async with original_context(app) as maybe_original_state:
|
|
|
+2026-05-22T08:18:50.643714559Z File "/usr/local/lib/python3.10/contextlib.py", line 199, in __aenter__
|
|
|
+2026-05-22T08:18:50.643728513Z return await anext(self.gen)
|
|
|
+2026-05-22T08:18:50.643741191Z File "/usr/local/lib/python3.10/site-packages/fastapi/routing.py", line 216, in merged_lifespan
|
|
|
+2026-05-22T08:18:50.643754359Z async with original_context(app) as maybe_original_state:
|
|
|
+2026-05-22T08:18:50.643766265Z File "/usr/local/lib/python3.10/contextlib.py", line 199, in __aenter__
|
|
|
+2026-05-22T08:18:50.643778654Z return await anext(self.gen)
|
|
|
+2026-05-22T08:18:50.643790810Z File "/usr/local/lib/python3.10/site-packages/fastapi/routing.py", line 216, in merged_lifespan
|
|
|
+2026-05-22T08:18:50.643803296Z async with original_context(app) as maybe_original_state:
|
|
|
+2026-05-22T08:18:50.643876231Z File "/usr/local/lib/python3.10/contextlib.py", line 199, in __aenter__
|
|
|
+2026-05-22T08:18:50.643894788Z return await anext(self.gen)
|
|
|
+2026-05-22T08:18:50.643932687Z File "/usr/local/lib/python3.10/site-packages/fastapi/routing.py", line 216, in merged_lifespan
|
|
|
+2026-05-22T08:18:50.643948630Z async with original_context(app) as maybe_original_state:
|
|
|
+2026-05-22T08:18:50.643956593Z File "/usr/local/lib/python3.10/contextlib.py", line 199, in __aenter__
|
|
|
+2026-05-22T08:18:50.643992791Z return await anext(self.gen)
|
|
|
+2026-05-22T08:18:50.644003789Z File "/app/main.py", line 48, in lifespan
|
|
|
+2026-05-22T08:18:50.644012003Z await deploy_service.recover_stale_deploys()
|
|
|
+2026-05-22T08:18:50.644020245Z File "/app/app/services/deploy_service.py", line 190, in recover_stale_deploys
|
|
|
+2026-05-22T08:18:50.644027950Z result = await session.execute(
|
|
|
+2026-05-22T08:18:50.644059576Z File "/usr/local/lib/python3.10/site-packages/sqlalchemy/ext/asyncio/session.py", line 449, in execute
|
|
|
+2026-05-22T08:18:50.644069585Z result = await greenlet_spawn(
|
|
|
+2026-05-22T08:18:50.644138024Z File "/usr/local/lib/python3.10/site-packages/sqlalchemy/util/_concurrency_py3k.py", line 201, in greenlet_spawn
|
|
|
+2026-05-22T08:18:50.644147460Z result = context.throw(*sys.exc_info())
|
|
|
+2026-05-22T08:18:50.644154680Z File "/usr/local/lib/python3.10/site-packages/sqlalchemy/orm/session.py", line 2351, in execute
|
|
|
+2026-05-22T08:18:50.644161970Z return self._execute_internal(
|
|
|
+2026-05-22T08:18:50.644169448Z File "/usr/local/lib/python3.10/site-packages/sqlalchemy/orm/session.py", line 2249, in _execute_internal
|
|
|
+2026-05-22T08:18:50.644176979Z result: Result[Any] = compile_state_cls.orm_execute_statement(
|
|
|
+2026-05-22T08:18:50.644187198Z File "/usr/local/lib/python3.10/site-packages/sqlalchemy/orm/context.py", line 306, in orm_execute_statement
|
|
|
+2026-05-22T08:18:50.644194755Z result = conn.execute(
|
|
|
+2026-05-22T08:18:50.644202469Z File "/usr/local/lib/python3.10/site-packages/sqlalchemy/engine/base.py", line 1419, in execute
|
|
|
+2026-05-22T08:18:50.644210982Z return meth(
|
|
|
+2026-05-22T08:18:50.644218155Z File "/usr/local/lib/python3.10/site-packages/sqlalchemy/sql/elements.py", line 527, in _execute_on_connection
|
|
|
+2026-05-22T08:18:50.644225705Z return connection._execute_clauseelement(
|
|
|
+2026-05-22T08:18:50.644233110Z File "/usr/local/lib/python3.10/site-packages/sqlalchemy/engine/base.py", line 1641, in _execute_clauseelement
|
|
|
+2026-05-22T08:18:50.644240494Z ret = self._execute_context(
|
|
|
+2026-05-22T08:18:50.644247607Z File "/usr/local/lib/python3.10/site-packages/sqlalchemy/engine/base.py", line 1846, in _execute_context
|
|
|
+2026-05-22T08:18:50.644255628Z return self._exec_single_context(
|
|
|
+2026-05-22T08:18:50.644262798Z File "/usr/local/lib/python3.10/site-packages/sqlalchemy/engine/base.py", line 1986, in _exec_single_context
|
|
|
+2026-05-22T08:18:50.644270155Z self._handle_dbapi_exception(
|
|
|
+2026-05-22T08:18:50.644277493Z File "/usr/local/lib/python3.10/site-packages/sqlalchemy/engine/base.py", line 2363, in _handle_dbapi_exception
|
|
|
+2026-05-22T08:18:50.644284845Z raise sqlalchemy_exception.with_traceback(exc_info[2]) from e
|
|
|
+2026-05-22T08:18:50.644292415Z File "/usr/local/lib/python3.10/site-packages/sqlalchemy/engine/base.py", line 1967, in _exec_single_context
|
|
|
+2026-05-22T08:18:50.644299797Z self.dialect.do_execute(
|
|
|
+2026-05-22T08:18:50.644306922Z File "/usr/local/lib/python3.10/site-packages/sqlalchemy/engine/default.py", line 952, in do_execute
|
|
|
+2026-05-22T08:18:50.644314255Z cursor.execute(statement, parameters)
|
|
|
+2026-05-22T08:18:50.644321572Z File "/usr/local/lib/python3.10/site-packages/sqlalchemy/dialects/postgresql/asyncpg.py", line 585, in execute
|
|
|
+2026-05-22T08:18:50.644328978Z self._adapt_connection.await_(
|
|
|
+2026-05-22T08:18:50.644336069Z File "/usr/local/lib/python3.10/site-packages/sqlalchemy/util/_concurrency_py3k.py", line 132, in await_only
|
|
|
+2026-05-22T08:18:50.644343564Z return current.parent.switch(awaitable) # type: ignore[no-any-return,attr-defined] # noqa: E501
|
|
|
+2026-05-22T08:18:50.644361572Z File "/usr/local/lib/python3.10/site-packages/sqlalchemy/util/_concurrency_py3k.py", line 196, in greenlet_spawn
|
|
|
+2026-05-22T08:18:50.644369648Z value = await result
|
|
|
+2026-05-22T08:18:50.644377041Z File "/usr/local/lib/python3.10/site-packages/sqlalchemy/dialects/postgresql/asyncpg.py", line 563, in _prepare_and_execute
|
|
|
+2026-05-22T08:18:50.644384536Z self._handle_exception(error)
|
|
|
+2026-05-22T08:18:50.644392498Z File "/usr/local/lib/python3.10/site-packages/sqlalchemy/dialects/postgresql/asyncpg.py", line 513, in _handle_exception
|
|
|
+2026-05-22T08:18:50.644400265Z self._adapt_connection._handle_exception(error)
|
|
|
+2026-05-22T08:18:50.644407753Z File "/usr/local/lib/python3.10/site-packages/sqlalchemy/dialects/postgresql/asyncpg.py", line 797, in _handle_exception
|
|
|
+2026-05-22T08:18:50.644415847Z raise translated_error from error
|
|
|
+2026-05-22T08:18:50.644480149Z sqlalchemy.exc.ProgrammingError: (sqlalchemy.dialects.postgresql.asyncpg.ProgrammingError) <class 'asyncpg.exceptions.UndefinedColumnError'>: column deploy_tasks.progress does not exist
|
|
|
+2026-05-22T08:18:50.644492007Z [SQL: SELECT deploy_tasks.id, deploy_tasks.job_id, deploy_tasks.status, deploy_tasks.output_path, deploy_tasks.error, deploy_tasks.progress, deploy_tasks.finished_at, deploy_tasks.created_at
|
|
|
+2026-05-22T08:18:50.644499657Z FROM deploy_tasks
|
|
|
+2026-05-22T08:18:50.644506710Z WHERE deploy_tasks.status IN ($1::VARCHAR, $2::VARCHAR)]
|
|
|
+2026-05-22T08:18:50.644513902Z [parameters: ('pending', 'running')]
|
|
|
+2026-05-22T08:18:50.644521041Z (Background on this error at: https://sqlalche.me/e/20/f405)
|
|
|
+2026-05-22T08:18:50.644528216Z
|
|
|
+2026-05-22T08:18:50.644535221Z ERROR: Application startup failed. Exiting.
|