|
|
@@ -219,101 +219,103 @@ async def import_project_dataset(
|
|
|
3. 保存到 uploads 目录
|
|
|
4. 写入 DatasetRecord 数据库
|
|
|
"""
|
|
|
- # 1. 请求导出(先尝试只导出已完成,若为空则回退导出全部)
|
|
|
- export_data = await _request(
|
|
|
- "POST",
|
|
|
- f"/api/v1/open/projects/{project_id}/datasets/download",
|
|
|
- json={"format": format, "completed_only": True},
|
|
|
- )
|
|
|
-
|
|
|
- file_url = export_data.get("file_url", "")
|
|
|
- file_name = export_data.get("file_name", f"{project_id}_{format}.json")
|
|
|
- total_exported = export_data.get("total_exported", 0)
|
|
|
+ # 尝试多种格式导出,某些项目可能不兼容 alpaca 格式
|
|
|
+ formats_to_try = [format]
|
|
|
+ if format != "json":
|
|
|
+ formats_to_try.append("json")
|
|
|
+ if format not in ("raw", "original"):
|
|
|
+ formats_to_try.append("raw")
|
|
|
|
|
|
- logger.info(
|
|
|
- f"Annotation export (completed_only=True): total_exported={total_exported}, "
|
|
|
- f"file_url={file_url}, file_name={file_name}"
|
|
|
- )
|
|
|
+ file_content = b""
|
|
|
+ file_name = ""
|
|
|
+ total_exported = 0
|
|
|
+ used_format = ""
|
|
|
|
|
|
- # 如果已完成数据为空,回退导出全部(包括未完成的)
|
|
|
- if total_exported == 0 or not file_url:
|
|
|
- logger.info(f"No completed items found, retrying with completed_only=False")
|
|
|
+ for try_format in formats_to_try:
|
|
|
+ # 1. 请求导出
|
|
|
export_data = await _request(
|
|
|
"POST",
|
|
|
f"/api/v1/open/projects/{project_id}/datasets/download",
|
|
|
- json={"format": format, "completed_only": False},
|
|
|
+ json={"format": try_format, "completed_only": True},
|
|
|
)
|
|
|
- file_url = export_data.get("file_url", "")
|
|
|
- file_name = export_data.get("file_name", f"{project_id}_{format}.json")
|
|
|
- total_exported = export_data.get("total_exported", 0)
|
|
|
+
|
|
|
logger.info(
|
|
|
- f"Annotation export (completed_only=False): total_exported={total_exported}, "
|
|
|
- f"file_url={file_url}, file_name={file_name}"
|
|
|
+ f"Annotation export response (format={try_format}): {export_data}"
|
|
|
)
|
|
|
|
|
|
- if not file_url:
|
|
|
- raise RuntimeError("标注平台未返回下载链接")
|
|
|
+ file_url = export_data.get("file_url", "")
|
|
|
+ file_name = export_data.get("file_name", f"{project_id}_{try_format}.json")
|
|
|
+ total_exported = export_data.get("total_exported", 0)
|
|
|
|
|
|
- # 2. 从 file_url 中提取 download_token
|
|
|
- # file_url 格式如: /api/v1/open/datasets/downloads/dl_abc123
|
|
|
- if "/datasets/downloads/" in file_url:
|
|
|
- download_token = file_url.split("/datasets/downloads/")[-1].strip("/")
|
|
|
- else:
|
|
|
- # 兜底:直接使用 file_url 的最后一段
|
|
|
- download_token = file_url.rstrip("/").split("/")[-1]
|
|
|
+ if not file_url:
|
|
|
+ logger.warning(f"No file_url for format={try_format}, trying next...")
|
|
|
+ continue
|
|
|
|
|
|
- # 3. 通过独立的下载接口获取文件(文档 4.6 节)
|
|
|
- # 标注平台导出是异步的,文件可能还没生成,需要轮询
|
|
|
- await get_token()
|
|
|
- base_url = _get_base_url()
|
|
|
- download_url = f"{base_url}/api/v1/open/datasets/downloads/{download_token}"
|
|
|
+ # 2. 从 file_url 中提取 download_token
|
|
|
+ if "/datasets/downloads/" in file_url:
|
|
|
+ download_token = file_url.split("/datasets/downloads/")[-1].strip("/")
|
|
|
+ else:
|
|
|
+ download_token = file_url.rstrip("/").split("/")[-1]
|
|
|
|
|
|
- file_content = b""
|
|
|
- max_retries = 6
|
|
|
- for attempt in range(max_retries):
|
|
|
- async with httpx.AsyncClient(timeout=120) as client:
|
|
|
- resp = await client.get(
|
|
|
- download_url,
|
|
|
- headers=_auth_headers(),
|
|
|
- follow_redirects=False,
|
|
|
- )
|
|
|
- # 手动跟随重定向,每次都带上认证头
|
|
|
- redirect_count = 0
|
|
|
- while resp.is_redirect and redirect_count < 5:
|
|
|
- redirect_url = resp.next_request.url
|
|
|
- logger.info(f"Download redirect to: {redirect_url}")
|
|
|
+ # 3. 下载文件,带轮询(标注平台生成文件可能需要时间)
|
|
|
+ await get_token()
|
|
|
+ base_url = _get_base_url()
|
|
|
+ download_url = f"{base_url}/api/v1/open/datasets/downloads/{download_token}"
|
|
|
+
|
|
|
+ file_content = b""
|
|
|
+ max_retries = 4
|
|
|
+ for attempt in range(max_retries):
|
|
|
+ async with httpx.AsyncClient(timeout=120) as client:
|
|
|
resp = await client.get(
|
|
|
- str(redirect_url),
|
|
|
+ download_url,
|
|
|
headers=_auth_headers(),
|
|
|
follow_redirects=False,
|
|
|
)
|
|
|
- redirect_count += 1
|
|
|
- resp.raise_for_status()
|
|
|
- file_content = resp.content
|
|
|
+ redirect_count = 0
|
|
|
+ while resp.is_redirect and redirect_count < 5:
|
|
|
+ redirect_url = resp.next_request.url
|
|
|
+ logger.info(f"Download redirect to: {redirect_url}")
|
|
|
+ resp = await client.get(
|
|
|
+ str(redirect_url),
|
|
|
+ headers=_auth_headers(),
|
|
|
+ follow_redirects=False,
|
|
|
+ )
|
|
|
+ redirect_count += 1
|
|
|
+ resp.raise_for_status()
|
|
|
+ file_content = resp.content
|
|
|
+
|
|
|
+ if len(file_content) > 10:
|
|
|
+ break
|
|
|
+
|
|
|
+ if attempt < max_retries - 1:
|
|
|
+ import asyncio
|
|
|
+ wait = 2 ** attempt # 1, 2, 4 秒
|
|
|
+ logger.info(
|
|
|
+ f"Download attempt {attempt + 1}/{max_retries} (format={try_format}): "
|
|
|
+ f"file too small ({len(file_content)} bytes), retrying in {wait}s..."
|
|
|
+ )
|
|
|
+ await asyncio.sleep(wait)
|
|
|
+
|
|
|
+ logger.info(
|
|
|
+ f"Downloaded (format={try_format}): {len(file_content)} bytes, "
|
|
|
+ f"content_type={resp.headers.get('content-type', 'unknown')}"
|
|
|
+ )
|
|
|
|
|
|
- # 文件内容足够大,说明下载成功
|
|
|
if len(file_content) > 10:
|
|
|
+ used_format = try_format
|
|
|
break
|
|
|
|
|
|
- # 内容为空或太小(如 [] 或 {}),文件可能还没生成
|
|
|
- if attempt < max_retries - 1:
|
|
|
- wait = 2 ** attempt # 1, 2, 4, 8, 16 秒
|
|
|
- logger.info(
|
|
|
- f"Download attempt {attempt + 1}/{max_retries}: "
|
|
|
- f"file too small ({len(file_content)} bytes), retrying in {wait}s..."
|
|
|
- )
|
|
|
- import asyncio
|
|
|
- await asyncio.sleep(wait)
|
|
|
- else:
|
|
|
- logger.warning(f"Download failed after {max_retries} attempts, file still empty")
|
|
|
+ logger.warning(
|
|
|
+ f"Format '{try_format}' returned empty file ({len(file_content)} bytes), "
|
|
|
+ f"trying next format..."
|
|
|
+ )
|
|
|
|
|
|
- logger.info(
|
|
|
- f"Downloaded annotation file: {len(file_content)} bytes, "
|
|
|
- f"content_type={resp.headers.get('content-type', 'unknown')}, "
|
|
|
- f"redirects={redirect_count}"
|
|
|
- )
|
|
|
- if len(file_content) < 200:
|
|
|
- logger.warning(f"Annotation file content suspiciously small: {file_content!r}")
|
|
|
+ if len(file_content) <= 10:
|
|
|
+ logger.warning(f"Annotation file content: {file_content!r}")
|
|
|
+ raise RuntimeError(
|
|
|
+ f"标注平台导出文件为空(尝试了格式: {formats_to_try}),"
|
|
|
+ f"total_exported={total_exported},请检查标注平台项目状态"
|
|
|
+ )
|
|
|
|
|
|
# 4. 保存到 uploads 目录
|
|
|
upload_dir = settings.uploads_dir
|
|
|
@@ -326,11 +328,11 @@ async def import_project_dataset(
|
|
|
|
|
|
file_path.write_bytes(file_content)
|
|
|
|
|
|
- # 5. 统一转为 JSONL 格式(和 ModelScope/HF 下载的数据格式一致)
|
|
|
+ # 5. 统一转为 JSONL 格式
|
|
|
jsonl_path = _convert_to_jsonl(file_path)
|
|
|
record_count = _count_records(jsonl_path, "jsonl")
|
|
|
|
|
|
- logger.info(f"Annotation file converted: {jsonl_path.name}, record_count={record_count}")
|
|
|
+ logger.info(f"Annotation file converted: {jsonl_path.name}, record_count={record_count}, format={used_format}")
|
|
|
|
|
|
# 6. 写入数据库(格式统一为 jsonl)
|
|
|
record_id = str(uuid.uuid4())
|