lxylxy123321 пре 1 недеља
родитељ
комит
f6561d9063
2 измењених фајлова са 65 додато и 32 уклоњено
  1. 0 1
      backend/app/engines/remote_train.py
  2. 65 31
      result.txt

+ 0 - 1
backend/app/engines/remote_train.py

@@ -94,7 +94,6 @@ async def run_training(job_id: str, model_id: str, model_type: str, dataset_path
 
         # DEBUG: 诊断权限
         import stat
-        from pathlib import Path
         proc_dir = settings.processed_dir
         _write_log(type="debug",
                    proc_dir=str(proc_dir),

+ 65 - 31
result.txt

@@ -1,31 +1,65 @@
-(base) [root@localhost ~]# docker exec finetune-trainer id
-uid=0(root) gid=0(root) groups=0(root)
-(base) [root@localhost ~]# docker exec finetune-trainer ls -la /root/Fine-tuning/backend/data/processed/
-total 160
-drwxrwxrwx. 1 root root 4096 May 21 10:14 .
-drwxrwxrwx. 1 1000 1000 4096 May 21 10:10 ..
-drwxrwxrwx. 1 root root 4096 May 20 00:50 ._____temp
--rwxrwxrwx. 1 root root 8287 May 21 09:47 0058e329-ea10-442c-b73f-7f4c64965478_processed.jsonl
--rw-r--r--. 1 root root 8287 May 21 10:03 20cbc0c5-c946-4166-9e21-a5a96e49a0da_processed.jsonl
--rwxrwxrwx. 1 root root 8287 May 21 09:27 3296b92a-8bd0-4c26-8c9d-d17778abfa14_processed.jsonl
--rwxrwxrwx. 1 root root 8287 May 20 11:31 35949b14-2872-47b4-9fdc-b87bda7279ef_processed.jsonl
--rwxrwxrwx. 1 root root 8287 May 21 09:23 7aa39fbf-b396-422d-82c9-73044ee4397e_processed.jsonl
--rwxrwxrwx. 1 root root 8287 May 20 11:48 7bcbc0bb-72c7-408f-a4c6-c38fb05b8382_processed.jsonl
--rwxrwxrwx. 1 root root 8287 May 20 13:53 92a0a9cd-46aa-48bc-b7ad-bd5a18270c51_processed.jsonl
--rw-r--r--. 1 root root 8287 May 21 10:10 9b7a9050-d1ec-405b-a8f0-6df562437794_processed.jsonl
--rwxrwxrwx. 1 root root 8287 May 20 13:12 a52d395e-d3c8-40d2-9be3-1839f597dc7f_processed.jsonl
--rwxrwxrwx. 1 root root 8287 May 20 14:27 aa342346-a39e-4644-9a34-f3a9d3b961f8_processed.jsonl
--rwxrwxrwx. 1 root root 8287 May 20 12:03 cce886de-4dd5-460a-b0ac-2404731cd9f8_processed.jsonl
--rwxrwxrwx. 1 root root 8287 May 20 11:38 d0412da9-b6d0-4ecf-8ae9-35600353bf3e_processed.jsonl
-drwxrwxrwx. 1 root root 4096 May 20 11:31 ms_yanalong_yanalong
--rw-r--r--. 1 1000 1000    0 May 21 10:14 test_write
-(base) [root@localhost ~]# docker exec finetune-trainer touch /root/Fine-tuning/backend/data/processed/test_container && echo "容器内成功" || echo "容器内失败"
-容器内成功
-(base) [root@localhost ~]# docker inspect finetune-trainer | grep -A5 '"Mounts"'
-        "Mounts": [
-            {
-                "Type": "bind",
-                "Source": "/root/Fine-tuning/backend",
-                "Destination": "/root/Fine-tuning/backend",
-                "Mode": "",
-(base) [root@localhost ~]# 
+lq@lq:~/Fine-tuning$ sudo docker logs -f -t finetune-backend
+[sudo] password for lq: 
+2026-05-21T02:34:43.351649510Z => Syncing backend code to compute node 192.168.91.253 ...
+2026-05-21T02:34:43.398837861Z Warning: Permanently added '192.168.91.253' (ED25519) to the list of known hosts.
+2026-05-21T02:35:01.023370182Z sending incremental file list
+2026-05-21T02:35:01.050523499Z app/engines/
+2026-05-21T02:35:01.050592451Z app/preprocessors/
+2026-05-21T02:35:01.091659254Z 
+2026-05-21T02:35:01.091745788Z sent 2,328 bytes  received 31 bytes  127.51 bytes/sec
+2026-05-21T02:35:01.091758710Z total size is 203,735  speedup is 86.36
+2026-05-21T02:35:01.093507150Z => Sync done.
+2026-05-21T02:35:02.344222594Z INFO:     Started server process [1]
+2026-05-21T02:35:02.344297685Z INFO:     Waiting for application startup.
+2026-05-21T02:35:02.434311439Z 2026-05-21 02:35:02 | INFO     | peft-platform | JobQueue started with 2 workers
+2026-05-21T02:35:02.434367300Z INFO:     Application startup complete.
+2026-05-21T02:35:02.435502488Z INFO:     Uvicorn running on http://0.0.0.0:8010 (Press CTRL+C to quit)
+2026-05-21T02:35:04.147983780Z INFO:     127.0.0.1:51418 - "GET /health HTTP/1.1" 200 OK
+2026-05-21T02:35:08.814099882Z INFO:     172.20.0.4:40850 - "GET /api/v1/models/ HTTP/1.0" 200 OK
+2026-05-21T02:35:08.839124444Z INFO:     172.20.0.4:40860 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-21T02:35:08.923924366Z INFO:     172.20.0.4:40872 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
+2026-05-21T02:35:10.473798949Z INFO:     172.20.0.4:40876 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-21T02:35:10.575801902Z INFO:     172.20.0.4:40892 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
+2026-05-21T02:35:10.589381990Z INFO:     172.20.0.4:40902 - "GET /api/v1/models/ HTTP/1.0" 200 OK
+2026-05-21T02:35:13.164806549Z 2026-05-21 02:35:13 | INFO     | peft-platform | Job 7fa42ee0-c310-4aaf-83eb-634790f9904d enqueued
+2026-05-21T02:35:13.164893550Z 2026-05-21 02:35:13 | INFO     | peft-platform | Training job created: 7fa42ee0-c310-4aaf-83eb-634790f9904d
+2026-05-21T02:35:13.165016465Z INFO:     172.20.0.4:40910 - "POST /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-21T02:35:13.209751718Z 2026-05-21 02:35:13 | INFO     | peft-platform | Preprocessed 60 samples for sft/alpaca
+2026-05-21T02:36:06.256137111Z 2026-05-21 02:36:06 | INFO     | peft-platform | Created remote dataset directory: /root/Fine-tuning/backend/data/datasets
+2026-05-21T02:36:06.256252195Z 2026-05-21 02:36:06 | INFO     | peft-platform | Uploading dataset file: /root/Fine-tuning/backend/data/processed/ms_yanalong_yanalong/data.jsonl -> /root/Fine-tuning/backend/data/datasets/data.jsonl
+2026-05-21T02:36:23.951981838Z 2026-05-21 02:36:23 | INFO     | peft-platform | Dataset uploaded successfully: /root/Fine-tuning/backend/data/datasets/data.jsonl
+2026-05-21T02:36:41.679737054Z 2026-05-21 02:36:41 | INFO     | peft-platform | Remote training launched in container: job=7fa42ee0-c310-4aaf-83eb-634790f9904d, container_pid=37
+2026-05-21T02:36:41.690100936Z [DEBUG] output_path=/root/Fine-tuning/backend/data/processed/7fa42ee0-c310-4aaf-83eb-634790f9904d_processed.jsonl
+2026-05-21T02:36:41.690253415Z [DEBUG] parent=/root/Fine-tuning/backend/data/processed, exists=True, writable=True
+2026-05-21T02:36:41.690265015Z [DEBUG] parent mode=0o40777
+2026-05-21T02:36:41.690272383Z [DEBUG] uid=0, gid=0
+2026-05-21T02:36:41.690279578Z INFO:     127.0.0.1:59794 - "GET /health HTTP/1.1" 200 OK
+2026-05-21T02:36:41.751113159Z INFO:     127.0.0.1:34422 - "GET /health HTTP/1.1" 200 OK
+2026-05-21T02:36:41.757340251Z INFO:     172.20.0.4:40928 - "GET /api/v1/models/ HTTP/1.0" 200 OK
+2026-05-21T02:36:41.853292345Z INFO:     172.20.0.4:40924 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-21T02:36:41.853552504Z INFO:     172.20.0.4:40940 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
+2026-05-21T02:36:41.912087522Z INFO:     172.20.0.4:54602 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-21T02:36:41.913050949Z INFO:     172.20.0.4:54618 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-21T02:36:41.918341110Z INFO:     172.20.0.4:44944 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-21T02:36:42.063986222Z INFO:     172.20.0.4:49000 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-21T02:36:42.223399081Z INFO:     172.20.0.4:49016 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-21T02:36:42.233469047Z INFO:     172.20.0.4:49026 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-21T02:36:42.331951157Z INFO:     172.20.0.4:49038 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-21T02:36:42.332695096Z INFO:     172.20.0.4:49042 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-21T02:36:42.389840552Z INFO:     172.20.0.4:49050 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-21T02:36:42.390568905Z INFO:     172.20.0.4:49056 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-21T02:36:42.391465567Z INFO:     172.20.0.4:49068 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-21T02:36:42.392360966Z INFO:     172.20.0.4:49072 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-21T02:36:42.492577438Z INFO:     172.20.0.4:49086 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-21T02:36:54.529704158Z INFO:     127.0.0.1:57198 - "GET /health HTTP/1.1" 200 OK
+2026-05-21T02:37:17.977243955Z INFO:     172.20.0.4:43980 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-21T02:37:20.444805620Z INFO:     172.20.0.4:43994 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-21T02:37:24.715196859Z INFO:     127.0.0.1:34044 - "GET /health HTTP/1.1" 200 OK
+2026-05-21T02:37:26.193334161Z INFO:     172.20.0.4:55586 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-21T02:37:31.187154610Z INFO:     172.20.0.4:55588 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-21T02:37:34.946677567Z 2026-05-21 02:37:34 | ERROR    | peft-platform | Remote job 7fa42ee0-c310-4aaf-83eb-634790f9904d failed: local variable 'Path' referenced before assignment
+2026-05-21T02:37:34.968928307Z 2026-05-21 02:37:34 | INFO     | peft-platform | Remote training launched for job 7fa42ee0-c310-4aaf-83eb-634790f9904d
+2026-05-21T02:37:36.185236838Z INFO:     172.20.0.4:40238 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-21T02:37:41.172814618Z INFO:     172.20.0.4:40254 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-21T02:37:54.917414999Z INFO:     127.0.0.1:43476 - "GET /health HTTP/1.1" 200 OK
+2026-05-21T02:38:25.110170590Z INFO:     127.0.0.1:56426 - "GET /health HTTP/1.1" 200 OK