ソースを参照

修复数据集下载落库问题,修复调用 GPU 问题

lxylxy123321 1週間前
コミット
57290cdfa0

+ 3 - 3
backend/app/services/dataset_service.py

@@ -1,7 +1,7 @@
 import asyncio
 import json
 import uuid
-from datetime import datetime, timezone
+from datetime import datetime
 from pathlib import Path
 from typing import Any
 
@@ -106,7 +106,7 @@ async def download_dataset(req: DatasetDownloadRequest) -> DatasetDownloadRespon
             format="jsonl",
             record_count=record_count,
             file_path=str(jsonl_path),
-            created_at=datetime.now(timezone.utc),
+            created_at=datetime.utcnow(),
         )
         async with async_session() as session:
             session.add(record)
@@ -190,7 +190,7 @@ async def upload_dataset(file: UploadFile) -> dict[str, Any]:
         format=fmt,
         record_count=record_count,
         file_path=str(file_path),
-        created_at=datetime.now(timezone.utc),
+        created_at=datetime.utcnow(),
     )
     async with async_session() as session:
         session.add(record)

+ 2 - 2
backend/app/services/deploy_service.py

@@ -1,5 +1,5 @@
 import uuid
-from datetime import datetime, timezone
+from datetime import datetime
 from pathlib import Path
 from typing import Any
 
@@ -27,7 +27,7 @@ async def export_adapter(job_id: str, config: dict[str, Any]) -> dict[str, Any]:
         id=task_id,
         job_id=job_id,
         status="pending",
-        created_at=datetime.now(timezone.utc),
+        created_at=datetime.utcnow(),
     )
     async with async_session() as session:
         session.add(task)

+ 2 - 2
backend/app/services/eval_service.py

@@ -1,6 +1,6 @@
 import json
 import uuid
-from datetime import datetime, timezone
+from datetime import datetime
 from typing import Any
 
 from app.config import get_settings
@@ -65,7 +65,7 @@ async def run_evaluation(job_id: str, config: dict[str, Any]) -> dict[str, Any]:
             id=eval_id,
             job_id=job_id,
             metrics=json.dumps(metrics),
-            created_at=datetime.now(timezone.utc),
+            created_at=datetime.utcnow(),
         )
         async with async_session() as session:
             session.add(eval_record)

+ 10 - 8
backend/app/services/model_test_service.py

@@ -16,11 +16,12 @@ async def test_model(model_id: str, prompt: str, max_new_tokens: int = 128, temp
 
 def _test_model_remote(model_id: str, prompt: str, max_new_tokens: int, temperature: float, top_p: float) -> dict[str, Any]:
     """通过 SSH 在算力节点执行模型测试。"""
+    import base64
     import json
     from app.core.remote_executor import ssh_exec
 
-    # 将 prompt 中的单引号转义,用于 Python 字符串格式化
-    safe_prompt = prompt.replace("'", "\\'")
+    # 将 prompt 中的单引号/反斜杠转义
+    safe_prompt = prompt.replace("\\", "\\\\").replace("'", "\\'")
 
     python_script = """\
 import json, asyncio
@@ -59,14 +60,15 @@ print(json.dumps({'generated_text': gen}))
 
     container = settings.compute_node_docker_container
     python = settings.compute_node_python
+    workdir = settings.compute_node_workdir
 
-    # 使用 docker exec -i + heredoc 传递脚本到容器内 Python stdin,
-    # 避免长命令被截断或引号解析错误;-w 指定工作目录确保 app 模块可导入
+    # 用 base64 编码脚本,通过 bash -c 执行:
+    # 1. bash -c 能激活 conda 环境(与训练命令一致)
+    # 2. base64 避免引号嵌套和命令截断问题
+    script_b64 = base64.b64encode(python_script.encode()).decode()
     remote_cmd = (
-        f"docker exec -i -w {settings.compute_node_workdir} {container} "
-        f"{python} << 'PYTHON_SCRIPT_EOF'\n"
-        f"{python_script}\n"
-        f"PYTHON_SCRIPT_EOF"
+        f"docker exec -w {workdir} {container} "
+        f"bash -c 'echo {script_b64} | base64 -d | {python}'"
     )
 
     code, stdout, stderr = ssh_exec(remote_cmd, timeout=600)

+ 5 - 5
backend/app/services/training_service.py

@@ -1,7 +1,7 @@
 import asyncio
 import json
 import uuid
-from datetime import datetime, timezone
+from datetime import datetime
 from typing import Any
 
 from app.config import get_settings
@@ -47,7 +47,7 @@ async def create_training_job(config: dict[str, Any]) -> dict[str, Any]:
         lora_dropout=config.get("lora_dropout", 0.05),
         lora_target_modules=config.get("lora_target_modules", "all-linear"),
         qlora_bits=config.get("qlora_bits", 4),
-        created_at=datetime.now(timezone.utc),
+        created_at=datetime.utcnow(),
     )
     async with async_session() as session:
         session.add(record)
@@ -109,7 +109,7 @@ async def cancel_training_job(job_id: str) -> dict[str, Any]:
         record = result.scalar_one_or_none()
         if record:
             record.status = "cancelled"
-            record.finished_at = datetime.now(timezone.utc)
+            record.finished_at = datetime.utcnow()
             await session.commit()
 
     logger.info(f"Job cancelled: {job_id}")
@@ -132,9 +132,9 @@ async def update_job_in_db(job):
                 record.adapter_path = job.adapter_path
                 record.error_message = job.error_message
                 if job.status == JobStatus.TRAINING and not record.started_at:
-                    record.started_at = datetime.now(timezone.utc)
+                    record.started_at = datetime.utcnow()
                 if job.status.is_terminal:
-                    record.finished_at = datetime.now(timezone.utc)
+                    record.finished_at = datetime.utcnow()
                 await session.commit()
     except Exception as e:
         logger.error(f"Failed to update job {job.id} in DB: {e}")

+ 18 - 183
result.txt

@@ -1,183 +1,18 @@
-  File "/usr/local/lib/python3.10/site-packages/starlette/applications.py", line 90, in __call__
-    await self.middleware_stack(scope, receive, send)
-  File "/usr/local/lib/python3.10/site-packages/starlette/middleware/errors.py", line 186, in __call__
-    raise exc
-  File "/usr/local/lib/python3.10/site-packages/starlette/middleware/errors.py", line 164, in __call__
-    await self.app(scope, receive, _send)
-  File "/usr/local/lib/python3.10/site-packages/starlette/middleware/cors.py", line 88, in __call__
-    await self.app(scope, receive, send)
-  File "/usr/local/lib/python3.10/site-packages/starlette/middleware/exceptions.py", line 63, in __call__
-    await wrap_app_handling_exceptions(self.app, conn)(scope, receive, send)
-  File "/usr/local/lib/python3.10/site-packages/starlette/_exception_handler.py", line 53, in wrapped_app
-    raise exc
-  File "/usr/local/lib/python3.10/site-packages/starlette/_exception_handler.py", line 42, in wrapped_app
-    await app(scope, receive, sender)
-  File "/usr/local/lib/python3.10/site-packages/fastapi/middleware/asyncexitstack.py", line 18, in __call__
-    await self.app(scope, receive, send)
-  File "/usr/local/lib/python3.10/site-packages/starlette/routing.py", line 660, in __call__
-    await self.middleware_stack(scope, receive, send)
-  File "/usr/local/lib/python3.10/site-packages/starlette/routing.py", line 680, in app
-    await route.handle(scope, receive, send)
-  File "/usr/local/lib/python3.10/site-packages/starlette/routing.py", line 276, in handle
-    await self.app(scope, receive, send)
-  File "/usr/local/lib/python3.10/site-packages/fastapi/routing.py", line 134, in app
-    await wrap_app_handling_exceptions(app, request)(scope, receive, send)
-  File "/usr/local/lib/python3.10/site-packages/starlette/_exception_handler.py", line 53, in wrapped_app
-    raise exc
-  File "/usr/local/lib/python3.10/site-packages/starlette/_exception_handler.py", line 42, in wrapped_app
-    await app(scope, receive, sender)
-  File "/usr/local/lib/python3.10/site-packages/fastapi/routing.py", line 120, in app
-    response = await f(request)
-  File "/usr/local/lib/python3.10/site-packages/fastapi/routing.py", line 674, in app
-    raw_response = await run_endpoint_function(
-  File "/usr/local/lib/python3.10/site-packages/fastapi/routing.py", line 328, in run_endpoint_function
-    return await dependant.call(**values)
-  File "/app/app/api/models.py", line 13, in list_models
-    models = await model_service.list_cached_models()
-  File "/app/app/services/model_service.py", line 126, in list_cached_models
-    async with async_session() as session:
-AttributeError: __aenter__
-INFO:     172.20.0.4:55612 - "GET /api/v1/models/ HTTP/1.0" 500 Internal Server Error
-ERROR:    Exception in ASGI application
-Traceback (most recent call last):
-  File "/usr/local/lib/python3.10/site-packages/uvicorn/protocols/http/httptools_impl.py", line 421, in run_asgi
-    result = await app(  # type: ignore[func-returns-value]
-  File "/usr/local/lib/python3.10/site-packages/uvicorn/middleware/proxy_headers.py", line 56, in __call__
-    return await self.app(scope, receive, send)
-  File "/usr/local/lib/python3.10/site-packages/fastapi/applications.py", line 1159, in __call__
-    await super().__call__(scope, receive, send)
-  File "/usr/local/lib/python3.10/site-packages/starlette/applications.py", line 90, in __call__
-    await self.middleware_stack(scope, receive, send)
-  File "/usr/local/lib/python3.10/site-packages/starlette/middleware/errors.py", line 186, in __call__
-    raise exc
-  File "/usr/local/lib/python3.10/site-packages/starlette/middleware/errors.py", line 164, in __call__
-    await self.app(scope, receive, _send)
-  File "/usr/local/lib/python3.10/site-packages/starlette/middleware/cors.py", line 88, in __call__
-    await self.app(scope, receive, send)
-  File "/usr/local/lib/python3.10/site-packages/starlette/middleware/exceptions.py", line 63, in __call__
-    await wrap_app_handling_exceptions(self.app, conn)(scope, receive, send)
-  File "/usr/local/lib/python3.10/site-packages/starlette/_exception_handler.py", line 53, in wrapped_app
-    raise exc
-  File "/usr/local/lib/python3.10/site-packages/starlette/_exception_handler.py", line 42, in wrapped_app
-    await app(scope, receive, sender)
-  File "/usr/local/lib/python3.10/site-packages/fastapi/middleware/asyncexitstack.py", line 18, in __call__
-    await self.app(scope, receive, send)
-  File "/usr/local/lib/python3.10/site-packages/starlette/routing.py", line 660, in __call__
-    await self.middleware_stack(scope, receive, send)
-  File "/usr/local/lib/python3.10/site-packages/starlette/routing.py", line 680, in app
-    await route.handle(scope, receive, send)
-  File "/usr/local/lib/python3.10/site-packages/starlette/routing.py", line 276, in handle
-    await self.app(scope, receive, send)
-  File "/usr/local/lib/python3.10/site-packages/fastapi/routing.py", line 134, in app
-    await wrap_app_handling_exceptions(app, request)(scope, receive, send)
-  File "/usr/local/lib/python3.10/site-packages/starlette/_exception_handler.py", line 53, in wrapped_app
-    raise exc
-  File "/usr/local/lib/python3.10/site-packages/starlette/_exception_handler.py", line 42, in wrapped_app
-    await app(scope, receive, sender)
-  File "/usr/local/lib/python3.10/site-packages/fastapi/routing.py", line 120, in app
-    response = await f(request)
-  File "/usr/local/lib/python3.10/site-packages/fastapi/routing.py", line 674, in app
-    raw_response = await run_endpoint_function(
-  File "/usr/local/lib/python3.10/site-packages/fastapi/routing.py", line 328, in run_endpoint_function
-    return await dependant.call(**values)
-  File "/app/app/api/models.py", line 13, in list_models
-    models = await model_service.list_cached_models()
-  File "/app/app/services/model_service.py", line 126, in list_cached_models
-    async with async_session() as session:
-AttributeError: __aenter__
-INFO:     172.20.0.4:55622 - "GET /api/v1/models/ HTTP/1.0" 500 Internal Server Error
-ERROR:    Exception in ASGI application
-Traceback (most recent call last):
-  File "/usr/local/lib/python3.10/site-packages/uvicorn/protocols/http/httptools_impl.py", line 421, in run_asgi
-    result = await app(  # type: ignore[func-returns-value]
-  File "/usr/local/lib/python3.10/site-packages/uvicorn/middleware/proxy_headers.py", line 56, in __call__
-    return await self.app(scope, receive, send)
-  File "/usr/local/lib/python3.10/site-packages/fastapi/applications.py", line 1159, in __call__
-    await super().__call__(scope, receive, send)
-  File "/usr/local/lib/python3.10/site-packages/starlette/applications.py", line 90, in __call__
-    await self.middleware_stack(scope, receive, send)
-  File "/usr/local/lib/python3.10/site-packages/starlette/middleware/errors.py", line 186, in __call__
-    raise exc
-  File "/usr/local/lib/python3.10/site-packages/starlette/middleware/errors.py", line 164, in __call__
-    await self.app(scope, receive, _send)
-  File "/usr/local/lib/python3.10/site-packages/starlette/middleware/cors.py", line 88, in __call__
-    await self.app(scope, receive, send)
-  File "/usr/local/lib/python3.10/site-packages/starlette/middleware/exceptions.py", line 63, in __call__
-    await wrap_app_handling_exceptions(self.app, conn)(scope, receive, send)
-  File "/usr/local/lib/python3.10/site-packages/starlette/_exception_handler.py", line 53, in wrapped_app
-    raise exc
-  File "/usr/local/lib/python3.10/site-packages/starlette/_exception_handler.py", line 42, in wrapped_app
-    await app(scope, receive, sender)
-  File "/usr/local/lib/python3.10/site-packages/fastapi/middleware/asyncexitstack.py", line 18, in __call__
-    await self.app(scope, receive, send)
-  File "/usr/local/lib/python3.10/site-packages/starlette/routing.py", line 660, in __call__
-    await self.middleware_stack(scope, receive, send)
-  File "/usr/local/lib/python3.10/site-packages/starlette/routing.py", line 680, in app
-    await route.handle(scope, receive, send)
-  File "/usr/local/lib/python3.10/site-packages/starlette/routing.py", line 276, in handle
-    await self.app(scope, receive, send)
-  File "/usr/local/lib/python3.10/site-packages/fastapi/routing.py", line 134, in app
-    await wrap_app_handling_exceptions(app, request)(scope, receive, send)
-  File "/usr/local/lib/python3.10/site-packages/starlette/_exception_handler.py", line 53, in wrapped_app
-    raise exc
-  File "/usr/local/lib/python3.10/site-packages/starlette/_exception_handler.py", line 42, in wrapped_app
-    await app(scope, receive, sender)
-  File "/usr/local/lib/python3.10/site-packages/fastapi/routing.py", line 120, in app
-    response = await f(request)
-  File "/usr/local/lib/python3.10/site-packages/fastapi/routing.py", line 674, in app
-    raw_response = await run_endpoint_function(
-  File "/usr/local/lib/python3.10/site-packages/fastapi/routing.py", line 328, in run_endpoint_function
-    return await dependant.call(**values)
-  File "/app/app/api/models.py", line 13, in list_models
-    models = await model_service.list_cached_models()
-  File "/app/app/services/model_service.py", line 126, in list_cached_models
-    async with async_session() as session:
-AttributeError: __aenter__
-INFO:     172.20.0.4:55624 - "GET /api/v1/models/ HTTP/1.0" 500 Internal Server Error
-ERROR:    Exception in ASGI application
-Traceback (most recent call last):
-  File "/usr/local/lib/python3.10/site-packages/uvicorn/protocols/http/httptools_impl.py", line 421, in run_asgi
-    result = await app(  # type: ignore[func-returns-value]
-  File "/usr/local/lib/python3.10/site-packages/uvicorn/middleware/proxy_headers.py", line 56, in __call__
-    return await self.app(scope, receive, send)
-  File "/usr/local/lib/python3.10/site-packages/fastapi/applications.py", line 1159, in __call__
-    await super().__call__(scope, receive, send)
-  File "/usr/local/lib/python3.10/site-packages/starlette/applications.py", line 90, in __call__
-    await self.middleware_stack(scope, receive, send)
-  File "/usr/local/lib/python3.10/site-packages/starlette/middleware/errors.py", line 186, in __call__
-    raise exc
-  File "/usr/local/lib/python3.10/site-packages/starlette/middleware/errors.py", line 164, in __call__
-    await self.app(scope, receive, _send)
-  File "/usr/local/lib/python3.10/site-packages/starlette/middleware/cors.py", line 88, in __call__
-    await self.app(scope, receive, send)
-  File "/usr/local/lib/python3.10/site-packages/starlette/middleware/exceptions.py", line 63, in __call__
-    await wrap_app_handling_exceptions(self.app, conn)(scope, receive, send)
-  File "/usr/local/lib/python3.10/site-packages/starlette/_exception_handler.py", line 53, in wrapped_app
-    raise exc
-  File "/usr/local/lib/python3.10/site-packages/starlette/_exception_handler.py", line 42, in wrapped_app
-    await app(scope, receive, sender)
-  File "/usr/local/lib/python3.10/site-packages/fastapi/middleware/asyncexitstack.py", line 18, in __call__
-    await self.app(scope, receive, send)
-  File "/usr/local/lib/python3.10/site-packages/starlette/routing.py", line 660, in __call__
-    await self.middleware_stack(scope, receive, send)
-  File "/usr/local/lib/python3.10/site-packages/starlette/routing.py", line 680, in app
-    await route.handle(scope, receive, send)
-  File "/usr/local/lib/python3.10/site-packages/starlette/routing.py", line 276, in handle
-    await self.app(scope, receive, send)
-  File "/usr/local/lib/python3.10/site-packages/fastapi/routing.py", line 134, in app
-    await wrap_app_handling_exceptions(app, request)(scope, receive, send)
-  File "/usr/local/lib/python3.10/site-packages/starlette/_exception_handler.py", line 53, in wrapped_app
-    raise exc
-  File "/usr/local/lib/python3.10/site-packages/starlette/_exception_handler.py", line 42, in wrapped_app
-    await app(scope, receive, sender)
-  File "/usr/local/lib/python3.10/site-packages/fastapi/routing.py", line 120, in app
-    response = await f(request)
-  File "/usr/local/lib/python3.10/site-packages/fastapi/routing.py", line 674, in app
-    raw_response = await run_endpoint_function(
-  File "/usr/local/lib/python3.10/site-packages/fastapi/routing.py", line 328, in run_endpoint_function
-    return await dependant.call(**values)
-  File "/app/app/api/models.py", line 13, in list_models
-    models = await model_service.list_cached_models()
-  File "/app/app/services/model_service.py", line 126, in list_cached_models
-    async with async_session() as session:
-AttributeError: __aenter__
+2026-05-19 15:16:02 | INFO     | peft-platform | Remote test result: code=1, stdout_len=0, stderr_len=408
+2026-05-19 15:16:02 | INFO     | peft-platform | stderr (first 500): Traceback (most recent call last):
+  File "<stdin>", line 2, in <module>
+  File "/root/Fine-tuning/backend/app/services/model_service.py", line 7, in <module>
+    from app.core.db import async_session, ModelCache
+  File "/root/Fine-tuning/backend/app/core/db.py", line 3, in <module>
+    from sqlalchemy import Column, DateTime, Float, Integer, String, Text
+ModuleNotFoundError: No module named 'sqlalchemy'
+
+2026-05-19 15:16:02 | ERROR    | peft-platform | Remote model test failed: Traceback (most recent call last):
+  File "<stdin>", line 2, in <module>
+  File "/root/Fine-tuning/backend/app/services/model_service.py", line 7, in <module>
+    from app.core.db import async_session, ModelCache
+  File "/root/Fine-tuning/backend/app/core/db.py", line 3, in <module>
+    from sqlalchemy import Column, DateTime, Float, Integer, String, Text
+ModuleNotFoundError: No module named 'sqlalchemy'
+
+INFO:     172.20.0.4:52338 - "POST /api/v1/models/test HTTP/1.0" 400 Bad Request