Просмотр исходного кода

修复标注平台数据集下载

lxylxy123321 7 часов назад
Родитель
Сommit
0603b4a1c1
2 измененных файлов с 101 добавлено и 89 удалено
  1. 78 76
      backend/app/services/annotation_platform_service.py
  2. 23 13
      result.txt

+ 78 - 76
backend/app/services/annotation_platform_service.py

@@ -219,101 +219,103 @@ async def import_project_dataset(
     3. 保存到 uploads 目录
     4. 写入 DatasetRecord 数据库
     """
-    # 1. 请求导出(先尝试只导出已完成,若为空则回退导出全部)
-    export_data = await _request(
-        "POST",
-        f"/api/v1/open/projects/{project_id}/datasets/download",
-        json={"format": format, "completed_only": True},
-    )
-
-    file_url = export_data.get("file_url", "")
-    file_name = export_data.get("file_name", f"{project_id}_{format}.json")
-    total_exported = export_data.get("total_exported", 0)
+    # 尝试多种格式导出,某些项目可能不兼容 alpaca 格式
+    formats_to_try = [format]
+    if format != "json":
+        formats_to_try.append("json")
+    if format not in ("raw", "original"):
+        formats_to_try.append("raw")
 
-    logger.info(
-        f"Annotation export (completed_only=True): total_exported={total_exported}, "
-        f"file_url={file_url}, file_name={file_name}"
-    )
+    file_content = b""
+    file_name = ""
+    total_exported = 0
+    used_format = ""
 
-    # 如果已完成数据为空,回退导出全部(包括未完成的)
-    if total_exported == 0 or not file_url:
-        logger.info(f"No completed items found, retrying with completed_only=False")
+    for try_format in formats_to_try:
+        # 1. 请求导出
         export_data = await _request(
             "POST",
             f"/api/v1/open/projects/{project_id}/datasets/download",
-            json={"format": format, "completed_only": False},
+            json={"format": try_format, "completed_only": True},
         )
-        file_url = export_data.get("file_url", "")
-        file_name = export_data.get("file_name", f"{project_id}_{format}.json")
-        total_exported = export_data.get("total_exported", 0)
+
         logger.info(
-            f"Annotation export (completed_only=False): total_exported={total_exported}, "
-            f"file_url={file_url}, file_name={file_name}"
+            f"Annotation export response (format={try_format}): {export_data}"
         )
 
-    if not file_url:
-        raise RuntimeError("标注平台未返回下载链接")
+        file_url = export_data.get("file_url", "")
+        file_name = export_data.get("file_name", f"{project_id}_{try_format}.json")
+        total_exported = export_data.get("total_exported", 0)
 
-    # 2. 从 file_url 中提取 download_token
-    # file_url 格式如: /api/v1/open/datasets/downloads/dl_abc123
-    if "/datasets/downloads/" in file_url:
-        download_token = file_url.split("/datasets/downloads/")[-1].strip("/")
-    else:
-        # 兜底:直接使用 file_url 的最后一段
-        download_token = file_url.rstrip("/").split("/")[-1]
+        if not file_url:
+            logger.warning(f"No file_url for format={try_format}, trying next...")
+            continue
 
-    # 3. 通过独立的下载接口获取文件(文档 4.6 节)
-    # 标注平台导出是异步的,文件可能还没生成,需要轮询
-    await get_token()
-    base_url = _get_base_url()
-    download_url = f"{base_url}/api/v1/open/datasets/downloads/{download_token}"
+        # 2. 从 file_url 中提取 download_token
+        if "/datasets/downloads/" in file_url:
+            download_token = file_url.split("/datasets/downloads/")[-1].strip("/")
+        else:
+            download_token = file_url.rstrip("/").split("/")[-1]
 
-    file_content = b""
-    max_retries = 6
-    for attempt in range(max_retries):
-        async with httpx.AsyncClient(timeout=120) as client:
-            resp = await client.get(
-                download_url,
-                headers=_auth_headers(),
-                follow_redirects=False,
-            )
-            # 手动跟随重定向,每次都带上认证头
-            redirect_count = 0
-            while resp.is_redirect and redirect_count < 5:
-                redirect_url = resp.next_request.url
-                logger.info(f"Download redirect to: {redirect_url}")
+        # 3. 下载文件,带轮询(标注平台生成文件可能需要时间)
+        await get_token()
+        base_url = _get_base_url()
+        download_url = f"{base_url}/api/v1/open/datasets/downloads/{download_token}"
+
+        file_content = b""
+        max_retries = 4
+        for attempt in range(max_retries):
+            async with httpx.AsyncClient(timeout=120) as client:
                 resp = await client.get(
-                    str(redirect_url),
+                    download_url,
                     headers=_auth_headers(),
                     follow_redirects=False,
                 )
-                redirect_count += 1
-            resp.raise_for_status()
-            file_content = resp.content
+                redirect_count = 0
+                while resp.is_redirect and redirect_count < 5:
+                    redirect_url = resp.next_request.url
+                    logger.info(f"Download redirect to: {redirect_url}")
+                    resp = await client.get(
+                        str(redirect_url),
+                        headers=_auth_headers(),
+                        follow_redirects=False,
+                    )
+                    redirect_count += 1
+                resp.raise_for_status()
+                file_content = resp.content
+
+            if len(file_content) > 10:
+                break
+
+            if attempt < max_retries - 1:
+                import asyncio
+                wait = 2 ** attempt  # 1, 2, 4 秒
+                logger.info(
+                    f"Download attempt {attempt + 1}/{max_retries} (format={try_format}): "
+                    f"file too small ({len(file_content)} bytes), retrying in {wait}s..."
+                )
+                await asyncio.sleep(wait)
+
+        logger.info(
+            f"Downloaded (format={try_format}): {len(file_content)} bytes, "
+            f"content_type={resp.headers.get('content-type', 'unknown')}"
+        )
 
-        # 文件内容足够大,说明下载成功
         if len(file_content) > 10:
+            used_format = try_format
             break
 
-        # 内容为空或太小(如 [] 或 {}),文件可能还没生成
-        if attempt < max_retries - 1:
-            wait = 2 ** attempt  # 1, 2, 4, 8, 16 秒
-            logger.info(
-                f"Download attempt {attempt + 1}/{max_retries}: "
-                f"file too small ({len(file_content)} bytes), retrying in {wait}s..."
-            )
-            import asyncio
-            await asyncio.sleep(wait)
-        else:
-            logger.warning(f"Download failed after {max_retries} attempts, file still empty")
+        logger.warning(
+            f"Format '{try_format}' returned empty file ({len(file_content)} bytes), "
+            f"trying next format..."
+        )
 
-    logger.info(
-        f"Downloaded annotation file: {len(file_content)} bytes, "
-        f"content_type={resp.headers.get('content-type', 'unknown')}, "
-        f"redirects={redirect_count}"
-    )
-    if len(file_content) < 200:
-        logger.warning(f"Annotation file content suspiciously small: {file_content!r}")
+    if len(file_content) <= 10:
+        logger.warning(f"Annotation file content: {file_content!r}")
+        raise RuntimeError(
+            f"标注平台导出文件为空(尝试了格式: {formats_to_try}),"
+            f"total_exported={total_exported},请检查标注平台项目状态"
+        )
 
     # 4. 保存到 uploads 目录
     upload_dir = settings.uploads_dir
@@ -326,11 +328,11 @@ async def import_project_dataset(
 
     file_path.write_bytes(file_content)
 
-    # 5. 统一转为 JSONL 格式(和 ModelScope/HF 下载的数据格式一致)
+    # 5. 统一转为 JSONL 格式
     jsonl_path = _convert_to_jsonl(file_path)
     record_count = _count_records(jsonl_path, "jsonl")
 
-    logger.info(f"Annotation file converted: {jsonl_path.name}, record_count={record_count}")
+    logger.info(f"Annotation file converted: {jsonl_path.name}, record_count={record_count}, format={used_format}")
 
     # 6. 写入数据库(格式统一为 jsonl)
     record_id = str(uuid.uuid4())

+ 23 - 13
result.txt

@@ -1,13 +1,23 @@
-INFO:     172.20.0.4:37122 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
-2026-05-28 08:29:33 | INFO     | httpx | HTTP Request: POST http://192.168.92.61:8003/api/v1/open/auth/token "HTTP/1.1 200 OK"
-2026-05-28 08:29:33 | INFO     | httpx | HTTP Request: GET http://192.168.92.61:8003/api/v1/open/projects?page=1&page_size=20 "HTTP/1.1 200 OK"
-INFO:     172.20.0.4:37134 - "GET /api/v1/annotation-platform/projects?page=1&page_size=20 HTTP/1.0" 200 OK
-2026-05-28 08:29:34 | INFO     | httpx | HTTP Request: POST http://192.168.92.61:8003/api/v1/open/projects/proj_2e8e2373469c/datasets/download "HTTP/1.1 200 OK"
-2026-05-28 08:29:34 | INFO     | peft-platform | Annotation export (completed_only=True): total_exported=4, file_url=/api/v1/open/datasets/downloads/dl_5078fe3a1cb6, file_name=proj_2e8e2373469c_alpaca_20260528_082934.json
-2026-05-28 08:29:34 | INFO     | httpx | HTTP Request: GET http://192.168.92.61:8003/api/v1/open/datasets/downloads/dl_5078fe3a1cb6 "HTTP/1.1 200 OK"
-2026-05-28 08:29:35 | INFO     | peft-platform | Downloaded annotation file: 2 bytes, content_type=application/json, url=http://192.168.92.61:8003/api/v1/open/datasets/downloads/dl_5078fe3a1cb6, redirects=0
-2026-05-28 08:29:35 | WARNING  | peft-platform | Annotation file content suspiciously small: b'[]'
-2026-05-28 08:29:35 | INFO     | peft-platform | Annotation file converted: 计算机实体标注_.jsonl, record_count=0
-2026-05-28 08:29:35 | INFO     | peft-platform | Imported dataset from annotation platform: proj_2e8e2373469c -> 计算机实体标注_.jsonl (0 records)
-INFO:     172.20.0.4:41034 - "POST /api/v1/annotation-platform/projects/proj_2e8e2373469c/import?project_name=%E8%AE%A1%E7%AE%97%E6%9C%BA%E5%AE%9E%E4%BD%93%E6%A0%87%E6%B3%A8%20&format=alpaca HTTP/1.0" 200 OK
-INFO:     172.20.0.4:41044 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
+2026-05-28 09:01:22 | INFO     | httpx | HTTP Request: POST http://192.168.92.61:8003/api/v1/open/auth/token "HTTP/1.1 200 OK"
+2026-05-28 09:01:22 | INFO     | httpx | HTTP Request: GET http://192.168.92.61:8003/api/v1/open/projects?page=1&page_size=20 "HTTP/1.1 200 OK"
+INFO:     172.20.0.4:40246 - "GET /api/v1/annotation-platform/projects?page=1&page_size=20 HTTP/1.0" 200 OK
+2026-05-28 09:01:24 | INFO     | httpx | HTTP Request: POST http://192.168.92.61:8003/api/v1/open/projects/proj_2e8e2373469c/datasets/download "HTTP/1.1 200 OK"
+2026-05-28 09:01:24 | INFO     | peft-platform | Annotation export (completed_only=True): total_exported=4, file_url=/api/v1/open/datasets/downloads/dl_692c3a34d6fa, file_name=proj_2e8e2373469c_alpaca_20260528_090124.json
+2026-05-28 09:01:24 | INFO     | httpx | HTTP Request: GET http://192.168.92.61:8003/api/v1/open/datasets/downloads/dl_692c3a34d6fa "HTTP/1.1 200 OK"
+2026-05-28 09:01:24 | INFO     | peft-platform | Download attempt 1/6: file too small (2 bytes), retrying in 1s...
+2026-05-28 09:01:25 | INFO     | httpx | HTTP Request: GET http://192.168.92.61:8003/api/v1/open/datasets/downloads/dl_692c3a34d6fa "HTTP/1.1 200 OK"
+2026-05-28 09:01:25 | INFO     | peft-platform | Download attempt 2/6: file too small (2 bytes), retrying in 2s...
+INFO:     127.0.0.1:46024 - "GET /health HTTP/1.1" 200 OK
+2026-05-28 09:01:27 | INFO     | httpx | HTTP Request: GET http://192.168.92.61:8003/api/v1/open/datasets/downloads/dl_692c3a34d6fa "HTTP/1.1 200 OK"
+2026-05-28 09:01:27 | INFO     | peft-platform | Download attempt 3/6: file too small (2 bytes), retrying in 4s...
+2026-05-28 09:01:31 | INFO     | httpx | HTTP Request: GET http://192.168.92.61:8003/api/v1/open/datasets/downloads/dl_692c3a34d6fa "HTTP/1.1 200 OK"
+2026-05-28 09:01:31 | INFO     | peft-platform | Download attempt 4/6: file too small (2 bytes), retrying in 8s...
+2026-05-28 09:01:39 | INFO     | httpx | HTTP Request: GET http://192.168.92.61:8003/api/v1/open/datasets/downloads/dl_692c3a34d6fa "HTTP/1.1 200 OK"
+2026-05-28 09:01:39 | INFO     | peft-platform | Download attempt 5/6: file too small (2 bytes), retrying in 16s...
+2026-05-28 09:01:55 | INFO     | httpx | HTTP Request: GET http://192.168.92.61:8003/api/v1/open/datasets/downloads/dl_692c3a34d6fa "HTTP/1.1 200 OK"
+2026-05-28 09:01:55 | WARNING  | peft-platform | Download failed after 6 attempts, file still empty
+2026-05-28 09:01:55 | INFO     | peft-platform | Downloaded annotation file: 2 bytes, content_type=application/json, redirects=0
+2026-05-28 09:01:55 | WARNING  | peft-platform | Annotation file content suspiciously small: b'[]'
+2026-05-28 09:01:55 | INFO     | peft-platform | Annotation file converted: eaacdd59_计算机实体标注_.jsonl, record_count=0
+2026-05-28 09:01:55 | INFO     | peft-platform | Imported dataset from annotation platform: proj_2e8e2373469c -> eaacdd59_计算机实体标注_.jsonl (0 records)
+INFO:     172.20.0.4:59282 - "POST /api/v1/annotation-platform/projects/proj_2e8e2373469c/import?project_name=%E8%AE%A1%E7%AE%97%E6%9C%BA%E5%AE%9E%E4%BD%93%E6%A0%87%E6%B3%A8%20&format=alpaca HTTP/1.0" 200 OK