Bladeren bron

修复数据集问题

lxylxy123321 1 week geleden
bovenliggende
commit
34f115a07b
3 gewijzigde bestanden met toevoegingen van 67 en 63 verwijderingen
  1. 17 28
      backend/app/services/dataset_service.py
  2. 25 11
      backend/app/services/model_test_service.py
  3. 25 24
      result.txt

+ 17 - 28
backend/app/services/dataset_service.py

@@ -104,37 +104,23 @@ async def download_dataset(req: DatasetDownloadRequest) -> DatasetDownloadRespon
 
 
 def _download_modelscope_dataset(dataset_id: str) -> tuple[Path, Path, int]:
-    """用 snapshot_download 下载数据集文件,完全绕过 datasets 库,避免版本兼容问题。"""
-    from modelscope import snapshot_download
+    """用 modelscope CLI 下载数据集,完全绕过 datasets 库,避免版本兼容问题。"""
+    import subprocess
 
     ds_dir = settings.processed_dir / f"ms_{dataset_id.replace('/', '_')}"
     ds_dir.mkdir(parents=True, exist_ok=True)
 
-    local_path = snapshot_download(dataset_id, cache_dir=str(settings.processed_dir))
-    logger.info(f"ModelScope snapshot_download returned local_path: {local_path}")
-
-    # 收集所有文件:先扫描 local_path 本身
-    all_files = []
-    if Path(local_path).exists():
-        all_files = [p for p in Path(local_path).rglob("*") if p.is_file()]
-        logger.info(f"Found {len(all_files)} files in local_path")
-
-    # ModelScope 可能把实际文件存到 cache_dir/downloads/<hash> 或 cache_dir/hub 里
-    # 额外扫描几个可能的目录
-    for subdir_name in ("downloads", "hub", f"models/{dataset_id}"):
-        extra_dir = settings.processed_dir / subdir_name
-        if extra_dir.exists():
-            for p in extra_dir.rglob("*"):
-                if p.is_file() and p not in all_files:
-                    all_files.append(p)
-            logger.info(f"Found additional files in {subdir_name}: {[p.name for p in all_files if str(p.parent).startswith(str(extra_dir))]}")
-
-    # 如果 local_path 里什么也没找到,尝试扫描整个 processed_dir
-    if not all_files:
-        logger.warning(f"No files found in local_path, scanning entire processed_dir")
-        for p in settings.processed_dir.rglob("*"):
-            if p.is_file() and p not in all_files:
-                all_files.append(p)
+    # 使用 CLI 方式下载,避免 snapshot_download API 的路径问题
+    cmd = ["modelscope", "download", "--dataset", dataset_id, "--local_dir", str(ds_dir)]
+    logger.info(f"Running: {' '.join(cmd)}")
+    proc = subprocess.run(cmd, capture_output=True, text=True, timeout=600)
+    if proc.returncode != 0:
+        logger.error(f"ModelScope CLI download failed (code={proc.returncode}): {proc.stderr[:500]}")
+        raise RuntimeError(f"ModelScope download failed: {proc.stderr[:500]}")
+
+    # 扫描下载目录中的所有文件
+    all_files = [p for p in ds_dir.rglob("*") if p.is_file()]
+    logger.info(f"ModelScope CLI downloaded {len(all_files)} files to {ds_dir}")
 
     # 识别训练数据文件
     data_files = [f for f in all_files if _is_training_data_file(f)]
@@ -146,7 +132,10 @@ def _download_modelscope_dataset(dataset_id: str) -> tuple[Path, Path, int]:
         if fallback:
             data_files = fallback
         else:
-            raise ValueError(f"No JSON/JSONL data files found in dataset {dataset_id}")
+            # 如果还是没有,列出所有文件供排查
+            logger.error(f"All downloaded files: {[str(f.relative_to(ds_dir)) for f in all_files]}")
+            raise ValueError(f"No JSON/JSONL data files found in dataset {dataset_id}. "
+                             f"Available files: {[f.name for f in all_files]}")
 
     # 优先取 train / data 开头的文件
     target = None

+ 25 - 11
backend/app/services/model_test_service.py

@@ -81,27 +81,41 @@ if model_path is None:
 t = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 t.pad_token = t.pad_token or t.eos_token
 
+# 判断 accelerate 是否可用,决定加载策略
+has_accelerate = False
+try:
+    import accelerate
+    has_accelerate = True
+except ImportError:
+    pass
+
 m = None
 load_errors = []
+device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
+
 for cls, kw in [(AutoModelForCausalLM, {{'trust_remote_code': True}}), (AutoModel, {{'trust_remote_code': True}})]:
-    try:
-        m = cls.from_pretrained(model_path, torch_dtype=torch.float16, device_map='auto', **kw)
-        break
-    except Exception as e:
-        load_errors.append(f'{{cls.__name__}} float16: {{str(e)[:200]}}')
-    # float16 失败时尝试 float32
-    try:
-        m = cls.from_pretrained(model_path, torch_dtype=torch.float32, device_map='auto', **kw)
+    for dtype_val, dtype_name in [(torch.float16, 'float16'), (torch.float32, 'float32')]:
+        try:
+            if has_accelerate:
+                # 有 accelerate,用 device_map='auto' 自动分配
+                m = cls.from_pretrained(model_path, dtype=dtype_val, device_map='auto', **kw)
+            else:
+                # 没有 accelerate,手动加载到单卡
+                m = cls.from_pretrained(model_path, dtype=dtype_val, device_map=None, **kw)
+                m = m.to(device)
+            break
+        except Exception as e:
+            load_errors.append(f'{{cls.__name__}} {{dtype_name}}: {{str(e)[:200]}}')
+    if m is not None:
         break
-    except Exception as e:
-        load_errors.append(f'{{cls.__name__}} float32: {{str(e)[:200]}}')
 
 if m is None:
     print(json.dumps({{'error': 'Unable to load model', 'details': load_errors}}))
     exit(1)
 
 m.eval()
-inp = t(prompt, return_tensors='pt').to(m.device)
+device = next(m.parameters()).device
+inp = t(prompt, return_tensors='pt').to(device)
 out = m.generate(**inp, max_new_tokens=max_new_tokens, temperature=temperature, top_p=top_p, do_sample=do_sample, pad_token_id=t.eos_token_id)
 gen = t.decode(out[0][inp['input_ids'].shape[1]:], skip_special_tokens=True)
 print(json.dumps({{'generated_text': gen}}))

+ 25 - 24
result.txt

@@ -1,24 +1,25 @@
-lq@lq:~$ sudo docker logs -f finetune-backend
-INFO:     Started server process [1]
-INFO:     Waiting for application startup.
-2026-05-19 16:49:51 | INFO     | peft-platform | JobQueue started with 2 workers
-INFO:     Application startup complete.
-INFO:     Uvicorn running on http://0.0.0.0:8010 (Press CTRL+C to quit)
-INFO:     127.0.0.1:58862 - "GET /health HTTP/1.1" 200 OK
-Downloading: 100%|██████████| 36.0/36.0 [00:00<00:00, 114B/s]
-Downloading: 100%|██████████| 1.35k/1.35k [00:00<00:00, 4.58kB/s]
-2026-05-19 16:50:05 | WARNING  | peft-platform | No training data files found in yanalong/yanalong. Available JSON files: ['configuration.json']
-2026-05-19 16:50:05 | INFO     | peft-platform | Selected data file: /root/Fine-tuning/backend/data/processed/yanalong/yanalong/configuration.json (size=36)
-2026-05-19 16:50:05 | ERROR    | peft-platform | Dataset download failed: (sqlalchemy.dialects.postgresql.asyncpg.Error) <class 'asyncpg.exceptions.DataError'>: invalid input for query argument $6: datetime.datetime(2026, 5, 19, 16, 50, 5... (can't subtract offset-naive and offset-aware datetimes)
-[SQL: INSERT INTO datasets (id, name, format, record_count, file_path, created_at) VALUES ($1::VARCHAR, $2::VARCHAR, $3::VARCHAR, $4::INTEGER, $5::VARCHAR, $6::TIMESTAMP WITHOUT TIME ZONE)]
-[parameters: ('8c678763-5fb3-4556-90ab-fe2abf10f881', 'yanalong/yanalong', 'jsonl', 1, '/root/Fine-tuning/backend/data/processed/ms_yanalong_yanalong/data.jsonl', datetime.datetime(2026, 5, 19, 16, 50, 5, 151652, tzinfo=datetime.timezone.utc))]
-(Background on this error at: https://sqlalche.me/e/20/dbapi)
-INFO:     172.20.0.4:58412 - "POST /api/v1/datasets/download HTTP/1.0" 400 Bad Request
-INFO:     172.20.0.4:56082 - "GET /api/v1/models/ HTTP/1.0" 200 OK
-2026-05-19 16:50:51 | INFO     | peft-platform | Remote test result: code=1, stdout_len=34, stderr_len=0
-2026-05-19 16:50:51 | INFO     | peft-platform | stdout (first 500): {"error": "Unable to load model"}
-
-2026-05-19 16:50:51 | ERROR    | peft-platform | Remote model test failed: 
-INFO:     172.20.0.4:56096 - "POST /api/v1/models/test HTTP/1.0" 400 Bad Request
-INFO:     127.0.0.1:39748 - "GET /health HTTP/1.1" 200 OK
-INFO:     127.0.0.1:43876 - "GET /health HTTP/1.1" 200 OK
+finetune-backend  | 2026-05-20 02:02:04 | INFO     | peft-platform | Remote test result: code=1, stdout_len=885, stderr_len=1764
+finetune-backend  | 2026-05-20 02:02:04 | INFO     | peft-platform | stdout (first 500): {"error": "Unable to load model", "details": ["AutoModelForCausalLM float16: Using a `device_map`, `tp_plan`, `torch.device` context manager or setting `torch.set_default_device(device)` requires `accelerate`. You can install it with `pip install accelerate`", "AutoModelForCausalLM float32: Using a `device_map`, `tp_plan`, `torch.device` context manager or setting `torch.set_default_device(device)` requires `accelerate`. You can install it with `pip install accelerate`", "AutoModel float16: Usin
+finetune-backend  | 2026-05-20 02:02:04 | INFO     | peft-platform | stderr (first 500): [transformers] `torch_dtype` is deprecated! Use `dtype` instead!
+finetune-backend  | Current Triton version 3.0.0 is below the recommended 3.2.0 version. Errors may occur and these issues will not be fixed. Please consider upgrading Triton.
+finetune-backend  | Current Python version 3.10 is below the recommended 3.11 version. It is recommended to upgrade to Python 3.11 or higher for the best experience.
+finetune-backend  | torch.compile is not available in Python 3.10, using identity decorator instead
+finetune-backend  | /opt/conda/lib/python3.10/site-packages/torchvision/d
+finetune-backend  | 2026-05-20 02:02:04 | ERROR    | peft-platform | Remote model test failed: [transformers] `torch_dtype` is deprecated! Use `dtype` instead!
+finetune-backend  | Current Triton version 3.0.0 is below the recommended 3.2.0 version. Errors may occur and these issues will not be fixed. Please consider upgrading Triton.
+finetune-backend  | Current Python version 3.10 is below the recommended 3.11 version. It is recommended to upgrade to Python 3.11 or higher for the best experience.
+finetune-backend  | torch.compile is not available in Python 3.10, using identity decorator instead
+finetune-backend  | /opt/conda/lib/python3.10/site-packages/torchvision/datapoints/__init__.py:12: UserWarning: The torchvision.datapoints and torchvision.transforms.v2 namespaces are still Beta. While we do not expect major breaking changes, some APIs may still change according to user feedback. Please submit any feedback you may have in this issue: https://github.com/pytorch/vision/issues/6753, and you can also check out https://github.com/pytorch/vision/issues/7319 to learn more about the APIs that we suspect might involve future changes. You can silence this warning by calling torchvision.disable_beta_transforms_warning().
+finetune-backend  |   warnings.warn(_BETA_TRANSFORMS_WARNING)
+finetune-backend  | /opt/conda/lib/python3.10/site-packages/torchvision/transforms/v2/__init__.py:54: UserWarning: The torchvision.datapoints and torchvision.transforms.v2 namespaces are still Beta. While we do not expect major breaking changes, some APIs may still change according to user feedback. Please submit any feedback you may have in this issue: https://github.com/pytorch/vision/issues/6753, and you can also check out https://github.com/pytorch/vision/issues/7319 to learn more about the APIs that we suspect might involve future changes. You can silence this warning by calling torchvision.disable_beta_transforms_warning().
+finetune-backend  |   warnings.warn(_BETA_TRANSFORMS_WARNING)
+finetune-backend  | 
+finetune-backend  | INFO:     172.20.0.4:60592 - "POST /api/v1/models/test HTTP/1.0" 400 Bad Request
+finetune-backend  | INFO:     127.0.0.1:50720 - "GET /health HTTP/1.1" 200 OK
+finetune-backend  | INFO:     127.0.0.1:47024 - "GET /health HTTP/1.1" 200 OK
+finetune-backend  | INFO:     127.0.0.1:37202 - "GET /health HTTP/1.1" 200 OK
+finetune-backend  | 2026-05-20 02:02:30 | INFO     | peft-platform | ModelScope snapshot_download returned local_path: /root/Fine-tuning/backend/data/processed/yanalong/yanalong
+finetune-backend  | 2026-05-20 02:02:30 | INFO     | peft-platform | Found 5 files in local_path
+finetune-backend  | 2026-05-20 02:02:30 | WARNING  | peft-platform | No training data files found in yanalong/yanalong. Available JSON files: []
+finetune-backend  | 2026-05-20 02:02:30 | ERROR    | peft-platform | Dataset download failed: No JSON/JSONL data files found in dataset yanalong/yanalong
+finetune-backend  | INFO:     172.20.0.4:54076 - "POST /api/v1/datasets/download HTTP/1.0" 400 Bad Request