lxylxy123321 5 часов назад
Родитель
Commit
c3fcc0a9f7
2 изменённых файла: 49 добавлений и 29 удалений
  1. 36 17
      backend/app/services/annotation_platform_service.py
  2. 13 12
      result.txt

+ 36 - 17
backend/app/services/annotation_platform_service.py

@@ -263,35 +263,54 @@ async def import_project_dataset(
         download_token = file_url.rstrip("/").split("/")[-1]
 
     # 3. 通过独立的下载接口获取文件(文档 4.6 节)
+    # 标注平台导出是异步的,文件可能还没生成,需要轮询
     await get_token()
     base_url = _get_base_url()
     download_url = f"{base_url}/api/v1/open/datasets/downloads/{download_token}"
 
-    async with httpx.AsyncClient(timeout=120) as client:
-        # 先手动处理重定向,确保每次请求都带上认证头
-        resp = await client.get(
-            download_url,
-            headers=_auth_headers(),
-            follow_redirects=False,
-        )
-        # 手动跟随重定向,每次都带上认证头
-        redirect_count = 0
-        while resp.is_redirect and redirect_count < 5:
-            redirect_url = resp.next_request.url
-            logger.info(f"Download redirect to: {redirect_url}")
+    file_content = b""
+    max_retries = 6
+    for attempt in range(max_retries):
+        async with httpx.AsyncClient(timeout=120) as client:
             resp = await client.get(
-                str(redirect_url),
+                download_url,
                 headers=_auth_headers(),
                 follow_redirects=False,
             )
-            redirect_count += 1
-        resp.raise_for_status()
-        file_content = resp.content
+            # 手动跟随重定向,每次都带上认证头
+            redirect_count = 0
+            while resp.is_redirect and redirect_count < 5:
+                redirect_url = resp.next_request.url
+                logger.info(f"Download redirect to: {redirect_url}")
+                resp = await client.get(
+                    str(redirect_url),
+                    headers=_auth_headers(),
+                    follow_redirects=False,
+                )
+                redirect_count += 1
+            resp.raise_for_status()
+            file_content = resp.content
+
+        # 文件内容足够大,说明下载成功
+        if len(file_content) > 10:
+            break
+
+        # 内容为空或太小(如 [] 或 {}),文件可能还没生成
+        if attempt < max_retries - 1:
+            wait = 2 ** attempt  # 1, 2, 4, 8, 16 秒
+            logger.info(
+                f"Download attempt {attempt + 1}/{max_retries}: "
+                f"file too small ({len(file_content)} bytes), retrying in {wait}s..."
+            )
+            import asyncio
+            await asyncio.sleep(wait)
+        else:
+            logger.warning(f"Download failed after {max_retries} attempts, file still empty")
 
     logger.info(
         f"Downloaded annotation file: {len(file_content)} bytes, "
         f"content_type={resp.headers.get('content-type', 'unknown')}, "
-        f"url={resp.url}, redirects={redirect_count}"
+        f"redirects={redirect_count}"
     )
     if len(file_content) < 200:
         logger.warning(f"Annotation file content suspiciously small: {file_content!r}")

+ 13 - 12
result.txt

@@ -1,12 +1,13 @@
-2026-05-28 08:26:28 | INFO     | httpx | HTTP Request: POST http://192.168.92.61:8003/api/v1/open/auth/token "HTTP/1.1 200 OK"
-2026-05-28 08:26:28 | INFO     | httpx | HTTP Request: GET http://192.168.92.61:8003/api/v1/open/projects?page=1&page_size=20 "HTTP/1.1 200 OK"
-INFO:     172.20.0.4:53778 - "GET /api/v1/annotation-platform/projects?page=1&page_size=20 HTTP/1.0" 200 OK
-2026-05-28 08:26:32 | INFO     | httpx | HTTP Request: GET http://192.168.92.61:8003/api/v1/open/projects/proj_2e8e2373469c "HTTP/1.1 200 OK"
-INFO:     172.20.0.4:53786 - "GET /api/v1/annotation-platform/projects/proj_2e8e2373469c HTTP/1.0" 200 OK
-2026-05-28 08:26:35 | INFO     | httpx | HTTP Request: POST http://192.168.92.61:8003/api/v1/open/projects/proj_2e8e2373469c/datasets/download "HTTP/1.1 200 OK"
-2026-05-28 08:26:35 | INFO     | peft-platform | Annotation export (completed_only=True): total_exported=4, file_url=/api/v1/open/datasets/downloads/dl_12fa17c6d874, file_name=proj_2e8e2373469c_alpaca_20260528_082635.json
-2026-05-28 08:26:35 | INFO     | httpx | HTTP Request: GET http://192.168.92.61:8003/api/v1/open/datasets/downloads/dl_12fa17c6d874 "HTTP/1.1 200 OK"
-2026-05-28 08:26:35 | INFO     | peft-platform | Downloaded annotation file: 2 bytes, content_type=application/json
-2026-05-28 08:26:35 | INFO     | peft-platform | Annotation file converted: 计算机实体标注_.jsonl, record_count=0
-2026-05-28 08:26:35 | INFO     | peft-platform | Imported dataset from annotation platform: proj_2e8e2373469c -> 计算机实体标注_.jsonl (0 records)
-INFO:     172.20.0.4:49930 - "POST /api/v1/annotation-platform/projects/proj_2e8e2373469c/import?project_name=%E8%AE%A1%E7%AE%97%E6%9C%BA%E5%AE%9E%E4%BD%93%E6%A0%87%E6%B3%A8%20&format=alpaca HTTP/1.0" 200 OK
+INFO:     172.20.0.4:37122 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
+2026-05-28 08:29:33 | INFO     | httpx | HTTP Request: POST http://192.168.92.61:8003/api/v1/open/auth/token "HTTP/1.1 200 OK"
+2026-05-28 08:29:33 | INFO     | httpx | HTTP Request: GET http://192.168.92.61:8003/api/v1/open/projects?page=1&page_size=20 "HTTP/1.1 200 OK"
+INFO:     172.20.0.4:37134 - "GET /api/v1/annotation-platform/projects?page=1&page_size=20 HTTP/1.0" 200 OK
+2026-05-28 08:29:34 | INFO     | httpx | HTTP Request: POST http://192.168.92.61:8003/api/v1/open/projects/proj_2e8e2373469c/datasets/download "HTTP/1.1 200 OK"
+2026-05-28 08:29:34 | INFO     | peft-platform | Annotation export (completed_only=True): total_exported=4, file_url=/api/v1/open/datasets/downloads/dl_5078fe3a1cb6, file_name=proj_2e8e2373469c_alpaca_20260528_082934.json
+2026-05-28 08:29:34 | INFO     | httpx | HTTP Request: GET http://192.168.92.61:8003/api/v1/open/datasets/downloads/dl_5078fe3a1cb6 "HTTP/1.1 200 OK"
+2026-05-28 08:29:35 | INFO     | peft-platform | Downloaded annotation file: 2 bytes, content_type=application/json, url=http://192.168.92.61:8003/api/v1/open/datasets/downloads/dl_5078fe3a1cb6, redirects=0
+2026-05-28 08:29:35 | WARNING  | peft-platform | Annotation file content suspiciously small: b'[]'
+2026-05-28 08:29:35 | INFO     | peft-platform | Annotation file converted: 计算机实体标注_.jsonl, record_count=0
+2026-05-28 08:29:35 | INFO     | peft-platform | Imported dataset from annotation platform: proj_2e8e2373469c -> 计算机实体标注_.jsonl (0 records)
+INFO:     172.20.0.4:41034 - "POST /api/v1/annotation-platform/projects/proj_2e8e2373469c/import?project_name=%E8%AE%A1%E7%AE%97%E6%9C%BA%E5%AE%9E%E4%BD%93%E6%A0%87%E6%B3%A8%20&format=alpaca HTTP/1.0" 200 OK
+INFO:     172.20.0.4:41044 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK