Browse Source

修复qlora报错

lxylxy123321 6 ngày trước
mục cha
commit
bc062579dd
2 tập tin đã thay đổi với 6 bổ sung và 123 xóa
  1. 1 0
      backend/app/engines/text_engine.py
  2. 5 123
      result.txt

+ 1 - 0
backend/app/engines/text_engine.py

@@ -91,6 +91,7 @@ class TextEngine(BaseEngine):
             "attn_implementation": "sdpa",
         }
         if quantization == "4bit" or quantization == "qlora":
+            load_kwargs["torch_dtype"] = torch.float16
             load_kwargs["quantization_config"] = BitsAndBytesConfig(
                 load_in_4bit=True,
                 bnb_4bit_quant_type="nf4",

+ 5 - 123
result.txt

@@ -1,123 +1,5 @@
-2026-05-22T08:18:50.643015421Z   File "/usr/local/lib/python3.10/site-packages/asyncpg/connection.py", line 638, in prepare
-2026-05-22T08:18:50.643022797Z     return await self._prepare(
-2026-05-22T08:18:50.643030223Z   File "/usr/local/lib/python3.10/site-packages/asyncpg/connection.py", line 657, in _prepare
-2026-05-22T08:18:50.643037457Z     stmt = await self._get_statement(
-2026-05-22T08:18:50.643044657Z   File "/usr/local/lib/python3.10/site-packages/asyncpg/connection.py", line 443, in _get_statement
-2026-05-22T08:18:50.643052007Z     statement = await self._protocol.prepare(
-2026-05-22T08:18:50.643059081Z   File "asyncpg/protocol/protocol.pyx", line 165, in prepare
-2026-05-22T08:18:50.643066273Z asyncpg.exceptions.UndefinedColumnError: column deploy_tasks.progress does not exist
-2026-05-22T08:18:50.643073515Z 
-2026-05-22T08:18:50.643080579Z The above exception was the direct cause of the following exception:
-2026-05-22T08:18:50.643087744Z 
-2026-05-22T08:18:50.643094663Z Traceback (most recent call last):
-2026-05-22T08:18:50.643101762Z   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/engine/base.py", line 1967, in _exec_single_context
-2026-05-22T08:18:50.643109207Z     self.dialect.do_execute(
-2026-05-22T08:18:50.643116203Z   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/engine/default.py", line 952, in do_execute
-2026-05-22T08:18:50.643123550Z     cursor.execute(statement, parameters)
-2026-05-22T08:18:50.643130624Z   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/dialects/postgresql/asyncpg.py", line 585, in execute
-2026-05-22T08:18:50.643137909Z     self._adapt_connection.await_(
-2026-05-22T08:18:50.643145080Z   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/util/_concurrency_py3k.py", line 132, in await_only
-2026-05-22T08:18:50.643152513Z     return current.parent.switch(awaitable)  # type: ignore[no-any-return,attr-defined] # noqa: E501
-2026-05-22T08:18:50.643191846Z   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/util/_concurrency_py3k.py", line 196, in greenlet_spawn
-2026-05-22T08:18:50.643199823Z     value = await result
-2026-05-22T08:18:50.643207640Z   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/dialects/postgresql/asyncpg.py", line 563, in _prepare_and_execute
-2026-05-22T08:18:50.643215143Z     self._handle_exception(error)
-2026-05-22T08:18:50.643227474Z   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/dialects/postgresql/asyncpg.py", line 513, in _handle_exception
-2026-05-22T08:18:50.643235388Z     self._adapt_connection._handle_exception(error)
-2026-05-22T08:18:50.643242533Z   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/dialects/postgresql/asyncpg.py", line 797, in _handle_exception
-2026-05-22T08:18:50.643249926Z     raise translated_error from error
-2026-05-22T08:18:50.643257198Z sqlalchemy.dialects.postgresql.asyncpg.AsyncAdapt_asyncpg_dbapi.ProgrammingError: <class 'asyncpg.exceptions.UndefinedColumnError'>: column deploy_tasks.progress does not exist
-2026-05-22T08:18:50.643265144Z 
-2026-05-22T08:18:50.643272147Z The above exception was the direct cause of the following exception:
-2026-05-22T08:18:50.643279941Z 
-2026-05-22T08:18:50.643286963Z Traceback (most recent call last):
-2026-05-22T08:18:50.643294066Z   File "/usr/local/lib/python3.10/site-packages/starlette/routing.py", line 638, in lifespan
-2026-05-22T08:18:50.643301414Z     async with self.lifespan_context(app) as maybe_state:
-2026-05-22T08:18:50.643308589Z   File "/usr/local/lib/python3.10/contextlib.py", line 199, in __aenter__
-2026-05-22T08:18:50.643315873Z     return await anext(self.gen)
-2026-05-22T08:18:50.643322977Z   File "/usr/local/lib/python3.10/site-packages/fastapi/routing.py", line 216, in merged_lifespan
-2026-05-22T08:18:50.643330348Z     async with original_context(app) as maybe_original_state:
-2026-05-22T08:18:50.643337567Z   File "/usr/local/lib/python3.10/contextlib.py", line 199, in __aenter__
-2026-05-22T08:18:50.643344825Z     return await anext(self.gen)
-2026-05-22T08:18:50.643351927Z   File "/usr/local/lib/python3.10/site-packages/fastapi/routing.py", line 216, in merged_lifespan
-2026-05-22T08:18:50.643434023Z     async with original_context(app) as maybe_original_state:
-2026-05-22T08:18:50.643471056Z   File "/usr/local/lib/python3.10/contextlib.py", line 199, in __aenter__
-2026-05-22T08:18:50.643488241Z     return await anext(self.gen)
-2026-05-22T08:18:50.643502853Z   File "/usr/local/lib/python3.10/site-packages/fastapi/routing.py", line 216, in merged_lifespan
-2026-05-22T08:18:50.643515442Z     async with original_context(app) as maybe_original_state:
-2026-05-22T08:18:50.643531260Z   File "/usr/local/lib/python3.10/contextlib.py", line 199, in __aenter__
-2026-05-22T08:18:50.643566372Z     return await anext(self.gen)
-2026-05-22T08:18:50.643581558Z   File "/usr/local/lib/python3.10/site-packages/fastapi/routing.py", line 216, in merged_lifespan
-2026-05-22T08:18:50.643599397Z     async with original_context(app) as maybe_original_state:
-2026-05-22T08:18:50.643611772Z   File "/usr/local/lib/python3.10/contextlib.py", line 199, in __aenter__
-2026-05-22T08:18:50.643623462Z     return await anext(self.gen)
-2026-05-22T08:18:50.643634678Z   File "/usr/local/lib/python3.10/site-packages/fastapi/routing.py", line 216, in merged_lifespan
-2026-05-22T08:18:50.643648989Z     async with original_context(app) as maybe_original_state:
-2026-05-22T08:18:50.643660765Z   File "/usr/local/lib/python3.10/contextlib.py", line 199, in __aenter__
-2026-05-22T08:18:50.643673291Z     return await anext(self.gen)
-2026-05-22T08:18:50.643688183Z   File "/usr/local/lib/python3.10/site-packages/fastapi/routing.py", line 216, in merged_lifespan
-2026-05-22T08:18:50.643701236Z     async with original_context(app) as maybe_original_state:
-2026-05-22T08:18:50.643714559Z   File "/usr/local/lib/python3.10/contextlib.py", line 199, in __aenter__
-2026-05-22T08:18:50.643728513Z     return await anext(self.gen)
-2026-05-22T08:18:50.643741191Z   File "/usr/local/lib/python3.10/site-packages/fastapi/routing.py", line 216, in merged_lifespan
-2026-05-22T08:18:50.643754359Z     async with original_context(app) as maybe_original_state:
-2026-05-22T08:18:50.643766265Z   File "/usr/local/lib/python3.10/contextlib.py", line 199, in __aenter__
-2026-05-22T08:18:50.643778654Z     return await anext(self.gen)
-2026-05-22T08:18:50.643790810Z   File "/usr/local/lib/python3.10/site-packages/fastapi/routing.py", line 216, in merged_lifespan
-2026-05-22T08:18:50.643803296Z     async with original_context(app) as maybe_original_state:
-2026-05-22T08:18:50.643876231Z   File "/usr/local/lib/python3.10/contextlib.py", line 199, in __aenter__
-2026-05-22T08:18:50.643894788Z     return await anext(self.gen)
-2026-05-22T08:18:50.643932687Z   File "/usr/local/lib/python3.10/site-packages/fastapi/routing.py", line 216, in merged_lifespan
-2026-05-22T08:18:50.643948630Z     async with original_context(app) as maybe_original_state:
-2026-05-22T08:18:50.643956593Z   File "/usr/local/lib/python3.10/contextlib.py", line 199, in __aenter__
-2026-05-22T08:18:50.643992791Z     return await anext(self.gen)
-2026-05-22T08:18:50.644003789Z   File "/app/main.py", line 48, in lifespan
-2026-05-22T08:18:50.644012003Z     await deploy_service.recover_stale_deploys()
-2026-05-22T08:18:50.644020245Z   File "/app/app/services/deploy_service.py", line 190, in recover_stale_deploys
-2026-05-22T08:18:50.644027950Z     result = await session.execute(
-2026-05-22T08:18:50.644059576Z   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/ext/asyncio/session.py", line 449, in execute
-2026-05-22T08:18:50.644069585Z     result = await greenlet_spawn(
-2026-05-22T08:18:50.644138024Z   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/util/_concurrency_py3k.py", line 201, in greenlet_spawn
-2026-05-22T08:18:50.644147460Z     result = context.throw(*sys.exc_info())
-2026-05-22T08:18:50.644154680Z   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/orm/session.py", line 2351, in execute
-2026-05-22T08:18:50.644161970Z     return self._execute_internal(
-2026-05-22T08:18:50.644169448Z   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/orm/session.py", line 2249, in _execute_internal
-2026-05-22T08:18:50.644176979Z     result: Result[Any] = compile_state_cls.orm_execute_statement(
-2026-05-22T08:18:50.644187198Z   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/orm/context.py", line 306, in orm_execute_statement
-2026-05-22T08:18:50.644194755Z     result = conn.execute(
-2026-05-22T08:18:50.644202469Z   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/engine/base.py", line 1419, in execute
-2026-05-22T08:18:50.644210982Z     return meth(
-2026-05-22T08:18:50.644218155Z   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/sql/elements.py", line 527, in _execute_on_connection
-2026-05-22T08:18:50.644225705Z     return connection._execute_clauseelement(
-2026-05-22T08:18:50.644233110Z   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/engine/base.py", line 1641, in _execute_clauseelement
-2026-05-22T08:18:50.644240494Z     ret = self._execute_context(
-2026-05-22T08:18:50.644247607Z   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/engine/base.py", line 1846, in _execute_context
-2026-05-22T08:18:50.644255628Z     return self._exec_single_context(
-2026-05-22T08:18:50.644262798Z   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/engine/base.py", line 1986, in _exec_single_context
-2026-05-22T08:18:50.644270155Z     self._handle_dbapi_exception(
-2026-05-22T08:18:50.644277493Z   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/engine/base.py", line 2363, in _handle_dbapi_exception
-2026-05-22T08:18:50.644284845Z     raise sqlalchemy_exception.with_traceback(exc_info[2]) from e
-2026-05-22T08:18:50.644292415Z   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/engine/base.py", line 1967, in _exec_single_context
-2026-05-22T08:18:50.644299797Z     self.dialect.do_execute(
-2026-05-22T08:18:50.644306922Z   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/engine/default.py", line 952, in do_execute
-2026-05-22T08:18:50.644314255Z     cursor.execute(statement, parameters)
-2026-05-22T08:18:50.644321572Z   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/dialects/postgresql/asyncpg.py", line 585, in execute
-2026-05-22T08:18:50.644328978Z     self._adapt_connection.await_(
-2026-05-22T08:18:50.644336069Z   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/util/_concurrency_py3k.py", line 132, in await_only
-2026-05-22T08:18:50.644343564Z     return current.parent.switch(awaitable)  # type: ignore[no-any-return,attr-defined] # noqa: E501
-2026-05-22T08:18:50.644361572Z   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/util/_concurrency_py3k.py", line 196, in greenlet_spawn
-2026-05-22T08:18:50.644369648Z     value = await result
-2026-05-22T08:18:50.644377041Z   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/dialects/postgresql/asyncpg.py", line 563, in _prepare_and_execute
-2026-05-22T08:18:50.644384536Z     self._handle_exception(error)
-2026-05-22T08:18:50.644392498Z   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/dialects/postgresql/asyncpg.py", line 513, in _handle_exception
-2026-05-22T08:18:50.644400265Z     self._adapt_connection._handle_exception(error)
-2026-05-22T08:18:50.644407753Z   File "/usr/local/lib/python3.10/site-packages/sqlalchemy/dialects/postgresql/asyncpg.py", line 797, in _handle_exception
-2026-05-22T08:18:50.644415847Z     raise translated_error from error
-2026-05-22T08:18:50.644480149Z sqlalchemy.exc.ProgrammingError: (sqlalchemy.dialects.postgresql.asyncpg.ProgrammingError) <class 'asyncpg.exceptions.UndefinedColumnError'>: column deploy_tasks.progress does not exist
-2026-05-22T08:18:50.644492007Z [SQL: SELECT deploy_tasks.id, deploy_tasks.job_id, deploy_tasks.status, deploy_tasks.output_path, deploy_tasks.error, deploy_tasks.progress, deploy_tasks.finished_at, deploy_tasks.created_at 
-2026-05-22T08:18:50.644499657Z FROM deploy_tasks 
-2026-05-22T08:18:50.644506710Z WHERE deploy_tasks.status IN ($1::VARCHAR, $2::VARCHAR)]
-2026-05-22T08:18:50.644513902Z [parameters: ('pending', 'running')]
-2026-05-22T08:18:50.644521041Z (Background on this error at: https://sqlalche.me/e/20/f405)
-2026-05-22T08:18:50.644528216Z 
-2026-05-22T08:18:50.644535221Z ERROR:    Application startup failed. Exiting.
+(base) [root@localhost ~]# docker exec finetune-trainer cat /root/Fine-tuning/backend/data/logs/638a1786-04d7-44ea-b274-2c673aea22e2.jsonl
+{"ts": "2026-05-22T08:58:18.130363+00:00", "type": "start", "job_id": "638a1786-04d7-44ea-b274-2c673aea22e2"}
+{"ts": "2026-05-22T08:58:18.132911+00:00", "type": "status", "status": "preprocessing"}
+{"ts": "2026-05-22T08:58:22.234319+00:00", "type": "status", "status": "loading_model"}
+{"ts": "2026-05-22T08:58:42.046499+00:00", "type": "error", "message": "GPU model loading failed: We encountered some issues during automatic conversion of the weights. For details look at the `CONVERSION` entries of the above report!", "traceback": "Traceback (most recent call last):\n  File \"/root/Fine-tuning/backend/app/engines/remote_train.py\", line 157, in run_training\n    await engine.load_model(model_id, quantization=quantization_mode)\n  File \"/root/Fine-tuning/backend/app/engines/text_engine.py\", line 131, in load_model\n    raise RuntimeError(f\"GPU model loading failed: {load_error[0]}\")\nRuntimeError: GPU model loading failed: We encountered some issues during automatic conversion of the weights. For details look at the `CONVERSION` entries of the above report!\n"}