فهرست منبع

修改数据集下载方式

lxylxy123321 1 هفته پیش
والد
کامیت
366efe5517
2 فایلهای تغییر یافته به همراه 41 افزوده شده و 97 حذف شده
  1. 8 64
      backend/app/services/dataset_service.py
  2. 33 33
      result.txt

+ 8 - 64
backend/app/services/dataset_service.py

@@ -104,77 +104,21 @@ async def download_dataset(req: DatasetDownloadRequest) -> DatasetDownloadRespon
 
 
 def _download_modelscope_dataset(dataset_id: str) -> tuple[Path, Path, int]:
-    """用 snapshot_download 下载数据集文件,完全绕过 datasets 库,避免版本兼容问题。"""
-    from modelscope import snapshot_download
+    """用 MsDataset 下载并转为 JSONL,保留之前已验证可用的逻辑。"""
+    from modelscope.msdatasets import MsDataset
 
+    ds = MsDataset.load(dataset_id)
     ds_dir = settings.processed_dir / f"ms_{dataset_id.replace('/', '_')}"
     ds_dir.mkdir(parents=True, exist_ok=True)
 
-    local_path = snapshot_download(dataset_id, cache_dir=str(settings.processed_dir))
-
-    # ModelScope 的 snapshot_download 把实际数据存到 cache_dir/downloads/<hash> 里
-    # 而 local_path 指向的目录只有元数据文件,需要额外扫描 downloads 目录
-    all_files = [p for p in Path(local_path).rglob("*") if p.is_file()]
-    downloads_dir = settings.processed_dir / "downloads"
-    if downloads_dir.exists():
-        for p in downloads_dir.rglob("*"):
-            if p.is_file() and str(p.parent) != str(ds_dir):
-                all_files.append(p)
-
-    # 识别训练数据文件
-    data_files = [f for f in all_files if _is_training_data_file(f)]
-
-    if not data_files:
-        fallback = [f for f in all_files if f.suffix in (".json", ".jsonl")]
-        logger.warning(f"No training data files found in {dataset_id}. "
-                       f"Available JSON files: {[f.name for f in fallback]}")
-        if fallback:
-            data_files = fallback
-        else:
-            raise ValueError(f"No JSON/JSONL data files found in dataset {dataset_id}")
-
-    # 优先取 train / data 开头的文件
-    target = None
-    for name in ("train.jsonl", "train.json", "data.jsonl", "data.json"):
-        for f in data_files:
-            if f.name == name:
-                target = f
-                break
-        if target:
-            break
-    if not target:
-        # 优先取数据量最大的文件
-        target = sorted(data_files, key=lambda f: f.stat().st_size, reverse=True)[0]
-
-    logger.info(f"Selected data file: {target} (size={target.stat().st_size})")
-
-    # 读取并统一转为 JSONL
+    # 取第一个 split(优先 train)
+    split_key = "train" if "train" in ds else list(ds.keys())[0]
+    split = ds[split_key]
+
     jsonl_path = ds_dir / "data.jsonl"
     record_count = 0
-    content = target.read_text(encoding="utf-8")
-
-    if target.suffix == ".jsonl" or not target.suffix:
-        # JSONL 或无后缀文件:尝试逐行解析
-        records = []
-        for line in content.splitlines():
-            line = line.strip()
-            if not line:
-                continue
-            try:
-                records.append(json.loads(line))
-            except json.JSONDecodeError:
-                # 如果逐行解析失败,尝试整体解析(可能是 JSON 数组)
-                records = json.loads(content)
-                if not isinstance(records, list):
-                    records = [records]
-                break
-    else:
-        records = json.loads(content)
-        if not isinstance(records, list):
-            records = [records]
-
     with open(jsonl_path, "w", encoding="utf-8") as f:
-        for item in records:
+        for item in split:
             f.write(json.dumps(item, ensure_ascii=False) + "\n")
             record_count += 1
 

+ 33 - 33
result.txt

@@ -1,33 +1,33 @@
-INFO:     172.19.0.3:52548 - "POST /api/v1/datasets/download HTTP/1.0" 200 OK
-INFO:     127.0.0.1:46426 - "GET /health HTTP/1.1" 200 OK
-INFO:     172.19.0.3:48310 - "GET /api/v1/models/ HTTP/1.0" 200 OK
-INFO:     172.19.0.3:48320 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
-INFO:     172.19.0.3:48332 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-15 17:24:03 | INFO     | peft-platform | Job 5999c2df-0b6a-4ec2-a99a-9894ef923a85 enqueued
-2026-05-15 17:24:03 | INFO     | peft-platform | Training job created: 5999c2df-0b6a-4ec2-a99a-9894ef923a85
-INFO:     172.19.0.3:48340 - "POST /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-15 17:24:03 | INFO     | peft-platform | Preprocessed 60 samples for sft/alpaca
-INFO:     172.19.0.3:48356 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.19.0.3:48362 - "GET /api/v1/models/ HTTP/1.0" 200 OK
-INFO:     172.19.0.3:48360 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
-2026-05-15 17:24:13 | INFO     | peft-platform | CUDA available: True
-2026-05-15 17:24:13 | INFO     | peft-platform | CUDA device count: 1
-2026-05-15 17:24:13 | INFO     | peft-platform | GPU 0: MetaX N260
-2026-05-15 17:24:13 | INFO     | peft-platform | GPU 0 memory: 63.78 GB
-[transformers] `torch_dtype` is deprecated! Use `dtype` instead!
-2026-05-15 17:24:14 | WARNING  | fla.utils | Current Triton version 3.0.0 is below the recommended 3.2.0 version. Errors may occur and these issues will not be fixed. Please consider upgrading Triton.
-2026-05-15 17:24:14 | WARNING  | fla.utils | Current Python version 3.10 is below the recommended 3.11 version. It is recommended to upgrade to Python 3.11 or higher for the best experience.
-2026-05-15 17:24:20 | WARNING  | fla.ops.rwkv7.fused_addcmul | torch.compile is not available in Python 3.10, using identity decorator instead
-/opt/conda/lib/python3.10/site-packages/torchvision/datapoints/__init__.py:12: UserWarning: The torchvision.datapoints and torchvision.transforms.v2 namespaces are still Beta. While we do not expect major breaking changes, some APIs may still change according to user feedback. Please submit any feedback you may have in this issue: https://github.com/pytorch/vision/issues/6753, and you can also check out https://github.com/pytorch/vision/issues/7319 to learn more about the APIs that we suspect might involve future changes. You can silence this warning by calling torchvision.disable_beta_transforms_warning().
-  warnings.warn(_BETA_TRANSFORMS_WARNING)
-/opt/conda/lib/python3.10/site-packages/torchvision/transforms/v2/__init__.py:54: UserWarning: The torchvision.datapoints and torchvision.transforms.v2 namespaces are still Beta. While we do not expect major breaking changes, some APIs may still change according to user feedback. Please submit any feedback you may have in this issue: https://github.com/pytorch/vision/issues/6753, and you can also check out https://github.com/pytorch/vision/issues/7319 to learn more about the APIs that we suspect might involve future changes. You can silence this warning by calling torchvision.disable_beta_transforms_warning().
-  warnings.warn(_BETA_TRANSFORMS_WARNING)
-Loading weights: 100%|██████████| 320/320 [00:00<00:00, 382.46it/s]
-2026-05-15 17:24:21 | INFO     | peft-platform | Loaded model: Qwen/Qwen3.5-0.8B
-Map: 100%|██████████| 60/60 [00:00<00:00, 2212.59 examples/s]
-/opt/conda/lib/python3.10/site-packages/peft/tuners/tuners_utils.py:1348: UserWarning: Model has `tie_word_embeddings=True` and a tied layer is part of the adapter, but `ensure_weight_tying` is not set to True. This can lead to complications, for example when merging the adapter or converting your model to formats other than safetensors. Check the discussion here: https://github.com/huggingface/peft/issues/2777
-  warnings.warn(msg)
-[transformers] warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.
-trainable params: 5,070,848 || all params: 757,463,872 || trainable%: 0.6695
-  0%|          | 0/12 [00:00<?, ?it/s]2026-05-15 17:27:03 | ERROR    | peft-platform | Training failed for job 5999c2df-0b6a-4ec2-a99a-9894ef923a85: out of resource: shared memory, Required: 106496, Hardware limit: 65536. Reducing block sizes or `num_stages` may help.
-2026-05-15 17:27:03 | ERROR    | peft-platform | Job 5999c2df-0b6a-4ec2-a99a-9894ef923a85 failed: out of resource: shared memory, Required: 106496, Hardware limit: 65536. Reducing block sizes or `num_stages` may help.
+(base) [root@localhost Fine-tuning]# git pull
+remote: Enumerating objects: 42, done.
+remote: Counting objects: 100% (42/42), done.
+remote: Compressing objects: 100% (23/23), done.
+remote: Total 23 (delta 17), reused 0 (delta 0), pack-reused 0 (from 0)
+Unpacking objects: 100% (23/23), 6.03 KiB | 561.00 KiB/s, done.
+From http://47.109.151.80:15030/Maas2-group/Fine-tuning
+   effc062..06a515a  main       -> origin/main
+Updating effc062..06a515a
+error: Your local changes to the following files would be overwritten by merge:
+	frontend/dist/index.html
+Please commit your changes or stash them before you merge.
+Aborting
+(base) [root@localhost Fine-tuning]# git status
+On branch main
+Your branch is behind 'origin/main' by 2 commits, and can be fast-forwarded.
+  (use "git pull" to update your local branch)
+
+Changes not staged for commit:
+  (use "git add/rm <file>..." to update what will be committed)
+  (use "git restore <file>..." to discard changes in working directory)
+	deleted:    frontend/dist/assets/index-BuI1P6s7.js
+	modified:   frontend/dist/index.html
+	modified:   frontend/tsconfig.tsbuildinfo
+
+Untracked files:
+  (use "git add <file>..." to include in what will be committed)
+	backend/app/main.py
+	backend/uv.lock
+	data/
+	frontend/dist/assets/index-BMiDKhk1.js
+
+no changes added to commit (use "git add" and/or "git commit -a")