Selaa lähdekoodia

修改数据集下载方式

lxylxy123321 1 viikko sitten
vanhempi
sitoutus
3be17afc90
2 muutettua tiedostoa, joissa 59 lisäystä ja 39 poistoa
  1. 10 6
      backend/app/services/dataset_service.py
  2. 49 33
      result.txt

+ 10 - 6
backend/app/services/dataset_service.py

@@ -104,21 +104,25 @@ async def download_dataset(req: DatasetDownloadRequest) -> DatasetDownloadRespon
 
 
 def _download_modelscope_dataset(dataset_id: str) -> tuple[Path, Path, int]:
-    """用 MsDataset 下载并转为 JSONL,保留之前已验证可用的逻辑。"""
+    """用 MsDataset 下载并转为 JSONL。"""
     from modelscope.msdatasets import MsDataset
 
-    ds = MsDataset.load(dataset_id)
+    try:
+        ds = MsDataset.load(dataset_id, subset_name="default", split="train")
+    except Exception:
+        # 回退:不带参数,自动选择第一个 split
+        ds = MsDataset.load(dataset_id)
+
     ds_dir = settings.processed_dir / f"ms_{dataset_id.replace('/', '_')}"
     ds_dir.mkdir(parents=True, exist_ok=True)
 
-    # 取第一个 split(优先 train)
-    split_key = "train" if "train" in ds else list(ds.keys())[0]
-    split = ds[split_key]
+    # 如果是 DatasetDict,取第一个 split
+    split_data = ds if not hasattr(ds, "keys") else ds[list(ds.keys())[0]]
 
     jsonl_path = ds_dir / "data.jsonl"
     record_count = 0
     with open(jsonl_path, "w", encoding="utf-8") as f:
-        for item in split:
+        for item in split_data:
             f.write(json.dumps(item, ensure_ascii=False) + "\n")
             record_count += 1
 

+ 49 - 33
result.txt

@@ -1,33 +1,49 @@
-(base) [root@localhost Fine-tuning]# git pull
-remote: Enumerating objects: 42, done.
-remote: Counting objects: 100% (42/42), done.
-remote: Compressing objects: 100% (23/23), done.
-remote: Total 23 (delta 17), reused 0 (delta 0), pack-reused 0 (from 0)
-Unpacking objects: 100% (23/23), 6.03 KiB | 561.00 KiB/s, done.
-From http://47.109.151.80:15030/Maas2-group/Fine-tuning
-   effc062..06a515a  main       -> origin/main
-Updating effc062..06a515a
-error: Your local changes to the following files would be overwritten by merge:
-	frontend/dist/index.html
-Please commit your changes or stash them before you merge.
-Aborting
-(base) [root@localhost Fine-tuning]# git status
-On branch main
-Your branch is behind 'origin/main' by 2 commits, and can be fast-forwarded.
-  (use "git pull" to update your local branch)
-
-Changes not staged for commit:
-  (use "git add/rm <file>..." to update what will be committed)
-  (use "git restore <file>..." to discard changes in working directory)
-	deleted:    frontend/dist/assets/index-BuI1P6s7.js
-	modified:   frontend/dist/index.html
-	modified:   frontend/tsconfig.tsbuildinfo
-
-Untracked files:
-  (use "git add <file>..." to include in what will be committed)
-	backend/app/main.py
-	backend/uv.lock
-	data/
-	frontend/dist/assets/index-BMiDKhk1.js
-
-no changes added to commit (use "git add" and/or "git commit -a")
+(base) [root@localhost Fine-tuning]# docker exec finetune-backend pip install --upgrade datasets
+Looking in indexes: http://mirrors.aliyun.com/pypi/simple
+Requirement already satisfied: datasets in /opt/conda/lib/python3.10/site-packages (4.8.5)
+Requirement already satisfied: filelock in /opt/conda/lib/python3.10/site-packages (from datasets) (3.29.0)
+Requirement already satisfied: numpy>=1.17 in /opt/conda/lib/python3.10/site-packages (from datasets) (1.26.4)
+Requirement already satisfied: pyarrow>=21.0.0 in /opt/conda/lib/python3.10/site-packages (from datasets) (24.0.0)
+Requirement already satisfied: dill<0.4.2,>=0.3.0 in /opt/conda/lib/python3.10/site-packages (from datasets) (0.4.1)
+Requirement already satisfied: pandas in /opt/conda/lib/python3.10/site-packages (from datasets) (2.3.3)
+Requirement already satisfied: requests>=2.32.2 in /opt/conda/lib/python3.10/site-packages (from datasets) (2.32.3)
+Requirement already satisfied: httpx<1.0.0 in /opt/conda/lib/python3.10/site-packages (from datasets) (0.28.1)
+Requirement already satisfied: tqdm>=4.66.3 in /opt/conda/lib/python3.10/site-packages (from datasets) (4.67.1)
+Requirement already satisfied: xxhash in /opt/conda/lib/python3.10/site-packages (from datasets) (3.7.0)
+Requirement already satisfied: multiprocess<0.70.20 in /opt/conda/lib/python3.10/site-packages (from datasets) (0.70.19)
+Requirement already satisfied: fsspec<=2026.2.0,>=2023.1.0 in /opt/conda/lib/python3.10/site-packages (from fsspec[http]<=2026.2.0,>=2023.1.0->datasets) (2025.5.1)
+Requirement already satisfied: huggingface-hub<2.0,>=0.25.0 in /opt/conda/lib/python3.10/site-packages (from datasets) (1.14.0)
+Requirement already satisfied: packaging in /opt/conda/lib/python3.10/site-packages (from datasets) (26.2)
+Requirement already satisfied: pyyaml>=5.1 in /opt/conda/lib/python3.10/site-packages (from datasets) (6.0.3)
+Requirement already satisfied: aiohttp!=4.0.0a0,!=4.0.0a1 in /opt/conda/lib/python3.10/site-packages (from fsspec[http]<=2026.2.0,>=2023.1.0->datasets) (3.13.5)
+Requirement already satisfied: anyio in /opt/conda/lib/python3.10/site-packages (from httpx<1.0.0->datasets) (4.13.0)
+Requirement already satisfied: certifi in /opt/conda/lib/python3.10/site-packages (from httpx<1.0.0->datasets) (2026.4.22)
+Requirement already satisfied: httpcore==1.* in /opt/conda/lib/python3.10/site-packages (from httpx<1.0.0->datasets) (1.0.9)
+Requirement already satisfied: idna in /opt/conda/lib/python3.10/site-packages (from httpx<1.0.0->datasets) (3.10)
+Requirement already satisfied: h11>=0.16 in /opt/conda/lib/python3.10/site-packages (from httpcore==1.*->httpx<1.0.0->datasets) (0.16.0)
+Requirement already satisfied: hf-xet<2.0.0,>=1.4.3 in /opt/conda/lib/python3.10/site-packages (from huggingface-hub<2.0,>=0.25.0->datasets) (1.4.3)
+Requirement already satisfied: typer>=0.20.0 in /opt/conda/lib/python3.10/site-packages (from huggingface-hub<2.0,>=0.25.0->datasets) (0.25.0)
+Requirement already satisfied: typing-extensions>=4.1.0 in /opt/conda/lib/python3.10/site-packages (from huggingface-hub<2.0,>=0.25.0->datasets) (4.15.0)
+Requirement already satisfied: aiohappyeyeballs>=2.5.0 in /opt/conda/lib/python3.10/site-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2026.2.0,>=2023.1.0->datasets) (2.6.1)
+Requirement already satisfied: aiosignal>=1.4.0 in /opt/conda/lib/python3.10/site-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2026.2.0,>=2023.1.0->datasets) (1.4.0)
+Requirement already satisfied: async-timeout<6.0,>=4.0 in /opt/conda/lib/python3.10/site-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2026.2.0,>=2023.1.0->datasets) (5.0.1)
+Requirement already satisfied: attrs>=17.3.0 in /opt/conda/lib/python3.10/site-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2026.2.0,>=2023.1.0->datasets) (26.1.0)
+Requirement already satisfied: frozenlist>=1.1.1 in /opt/conda/lib/python3.10/site-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2026.2.0,>=2023.1.0->datasets) (1.8.0)
+Requirement already satisfied: multidict<7.0,>=4.5 in /opt/conda/lib/python3.10/site-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2026.2.0,>=2023.1.0->datasets) (6.7.1)
+Requirement already satisfied: propcache>=0.2.0 in /opt/conda/lib/python3.10/site-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2026.2.0,>=2023.1.0->datasets) (0.4.1)
+Requirement already satisfied: yarl<2.0,>=1.17.0 in /opt/conda/lib/python3.10/site-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2026.2.0,>=2023.1.0->datasets) (1.23.0)
+Requirement already satisfied: charset_normalizer<4,>=2 in /opt/conda/lib/python3.10/site-packages (from requests>=2.32.2->datasets) (3.4.1)
+Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/conda/lib/python3.10/site-packages (from requests>=2.32.2->datasets) (2.3.0)
+Requirement already satisfied: click>=8.2.1 in /opt/conda/lib/python3.10/site-packages (from typer>=0.20.0->huggingface-hub<2.0,>=0.25.0->datasets) (8.2.1)
+Requirement already satisfied: shellingham>=1.3.0 in /opt/conda/lib/python3.10/site-packages (from typer>=0.20.0->huggingface-hub<2.0,>=0.25.0->datasets) (1.5.4)
+Requirement already satisfied: rich>=13.8.0 in /opt/conda/lib/python3.10/site-packages (from typer>=0.20.0->huggingface-hub<2.0,>=0.25.0->datasets) (15.0.0)
+Requirement already satisfied: annotated-doc>=0.0.2 in /opt/conda/lib/python3.10/site-packages (from typer>=0.20.0->huggingface-hub<2.0,>=0.25.0->datasets) (0.0.4)
+Requirement already satisfied: markdown-it-py>=2.2.0 in /opt/conda/lib/python3.10/site-packages (from rich>=13.8.0->typer>=0.20.0->huggingface-hub<2.0,>=0.25.0->datasets) (4.0.0)
+Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /opt/conda/lib/python3.10/site-packages (from rich>=13.8.0->typer>=0.20.0->huggingface-hub<2.0,>=0.25.0->datasets) (2.19.2)
+Requirement already satisfied: mdurl~=0.1 in /opt/conda/lib/python3.10/site-packages (from markdown-it-py>=2.2.0->rich>=13.8.0->typer>=0.20.0->huggingface-hub<2.0,>=0.25.0->datasets) (0.1.2)
+Requirement already satisfied: exceptiongroup>=1.0.2 in /opt/conda/lib/python3.10/site-packages (from anyio->httpx<1.0.0->datasets) (1.3.0)
+Requirement already satisfied: python-dateutil>=2.8.2 in /opt/conda/lib/python3.10/site-packages (from pandas->datasets) (2.9.0.post0)
+Requirement already satisfied: pytz>=2020.1 in /opt/conda/lib/python3.10/site-packages (from pandas->datasets) (2026.1.post1)
+Requirement already satisfied: tzdata>=2022.7 in /opt/conda/lib/python3.10/site-packages (from pandas->datasets) (2026.2)
+Requirement already satisfied: six>=1.5 in /opt/conda/lib/python3.10/site-packages (from python-dateutil>=2.8.2->pandas->datasets) (1.17.0)
+WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager, possibly rendering your system unusable. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv. Use the --root-user-action option if you know what you are doing and want to suppress this warning.