#!/usr/bin/env python3
"""
scrape_model_info.py

Scrape basic information and capabilities from Aliyun Bailian model pages:
- model code (model_code)
- model name (name)
- model description (description)
- capability tags (capabilities): e.g. 文本生成, 深度思考
- feature flags (features): e.g. function calling, 联网搜索
- input/output modalities (input_modalities / output_modalities)

Approach: intercept the backend APIs the page calls (listFoundationModels
etc.) and extract fields directly from the JSON responses, which is more
reliable than parsing the rendered HTML.
"""

import json
import re
import time
from typing import Any, Dict, List, Optional

from playwright.sync_api import Page, TimeoutError as PlaywrightTimeoutError

# Capability code -> display label, as shown on the page.
CAPABILITY_LABELS: Dict[str, str] = {
    "TG": "文本生成",
    "QwQ": "深度思考",
    "Reasoning": "深度思考",
    "VU": "视觉理解",
    "AU": "音频理解",
    "VID": "视频理解",
    "VG": "视频生成",
    "IMG": "图像生成",
    "IG": "图像生成",
    "EMB": "向量表示",
    "TR": "向量模型",
    "ME": "多模态向量",
    "ASR": "语音识别",
    "Realtime-ASR": "实时语音识别",
    "RealtimeASR": "实时语音识别",
    "TTS": "语音合成",
}

# Feature items rendered on the page (screenshot order: left column top-down,
# then right column top-down).
# key = feature string returned by the API, value = display name on the page.
FEATURE_LABELS: Dict[str, str] = {
    "model-experience": "模型体验",
    "function-calling": "function calling",
    "structured-outputs": "结构化输出",
    "web-search": "联网搜索",
    "prefix-completion": "前缀续写",
    "cache": "cache存储",
    "batch": "批量推理",
    "model-optimization": "模型调优",
}

# Fixed display order of all feature items in the capability section
# (matches the screenshot).
ALL_FEATURES_ORDERED: List[str] = [
    "model-experience",     # 模型体验
    "function-calling",     # function calling
    "structured-outputs",   # 结构化输出
    "web-search",           # 联网搜索
    "prefix-completion",    # 前缀续写
    "cache",                # cache存储
    "batch",                # 批量推理
    "model-optimization",   # 模型调优
]

# URL keywords of the backend APIs worth intercepting.
API_URL_RE = re.compile(
    r"listFoundationModels|listRecommendedModels|listFeaturedModels|getModelDetail|modelCenter",
    re.I,
)


def _extract_model_id_from_url(url: str) -> str:
    """Extract the model ID from the hash part of a page URL,
    e.g. ``#/model-market/detail/qwen3-max`` -> ``qwen3-max``."""
    # Prefer the hash fragment.
    hash_match = re.search(r"#.*?/detail/([^/?#&]+)", url)
    if hash_match:
        return hash_match.group(1).strip()
    # Fallback: use the last path segment.
    clean = re.sub(r"[?#].*", "", url)
    parts = [p for p in clean.rstrip("/").split("/") if p]
    return parts[-1] if parts else ""


def _merge_with_items(obj: Dict) -> Dict:
    """
    If *obj* is a "group" entry (``group=true`` or ``model`` starts with
    ``group-``), fill missing ``features`` / ``modelInfo`` /
    ``inferenceMetadata`` / ``capabilities`` from ``items[0]``.

    Group fields win; the first child model only supplies what is missing.
    """
    items = obj.get("items", [])
    if not items or not isinstance(items, list):
        return obj

    merged = dict(obj)
    child = items[0] if isinstance(items[0], dict) else {}
    for key in ("features", "modelInfo", "inferenceMetadata", "capabilities"):
        # Only borrow the child's value when the group's field is empty.
        group_val = merged.get(key)
        child_val = child.get(key)
        if not group_val and child_val:
            merged[key] = child_val

    # description: prefer the group's, fall back to the child's.
    if not merged.get("description") and child.get("description"):
        merged["description"] = child["description"]
    return merged


def _find_model_in_json(data: Any, target: str) -> Optional[Dict]:
    """
    Recursively search arbitrary JSON data for the model object matching
    *target*.

    Match rule: the ``model`` field, with any ``group-`` prefix stripped,
    must equal *target* exactly (case-insensitive).  On a hit, missing
    fields are filled from ``items[0]`` via ``_merge_with_items``.
    """
    clean_target = re.sub(r"^group-", "", target.lower())
    if isinstance(data, dict):
        model_val = re.sub(r"^group-", "", str(data.get("model", "")).lower())
        # Exact match on the model field; require the node to actually look
        # like a model entry (has a "model" or "name" key).
        if model_val == clean_target and ("model" in data or "name" in data):
            return _merge_with_items(data)
        for v in data.values():
            found = _find_model_in_json(v, target)
            if found:
                return found
    elif isinstance(data, list):
        for item in data:
            found = _find_model_in_json(item, target)
            if found:
                return found
    return None


def parse_model_info(model_obj: Dict) -> Dict:
    """
    Parse an API model object into structured info, field order mirroring
    the page:
      model_code -> display_tags -> description ->
      input_modalities -> output_modalities -> features (fixed 8, true/false)
    """
    info: Dict = {}

    # ── model code ──
    info["model_code"] = re.sub(r"^group-", "", model_obj.get("model", ""))

    # ── intro tags (first line of the page header: Qwen3 · 文本生成 · 深度思考) ──
    display_tags: List[str] = []
    ct = model_obj.get("collectionTag", "")
    if ct:
        display_tags.append(re.sub(r"^qwen", "Qwen", ct, flags=re.I))

    # Some labels have a "more specific" variant; skip the base label when
    # the specific one is also present.
    SUPERSEDED_BY: Dict[str, str] = {
        "语音识别": "实时语音识别",
    }
    caps_raw: List[str] = model_obj.get("capabilities", [])
    all_labels = [CAPABILITY_LABELS.get(c, c) for c in caps_raw]
    labels_set = set(all_labels)
    for label in all_labels:
        superseded_by = SUPERSEDED_BY.get(label)
        if superseded_by and superseded_by in labels_set:
            continue  # a more specific variant exists — skip the base label
        if label not in display_tags:
            display_tags.append(label)
    info["display_tags"] = display_tags

    # ── description ──
    info["description"] = (
        model_obj.get("description", "")
        or model_obj.get("shortDescription", "")
    )

    # ── input / output modalities ──
    meta = model_obj.get("inferenceMetadata", {})
    info["input_modalities"] = meta.get("request_modality", [])
    info["output_modalities"] = meta.get("response_modality", [])

    # ── model features: fixed 8 items, each true/false ──
    features_raw: List[str] = model_obj.get("features", [])
    features_set = set(features_raw)
    info["features"] = {
        FEATURE_LABELS[key]: (key in features_set)
        for key in ALL_FEATURES_ORDERED
    }
    return info


def scrape_model_info(page: Page, url: str) -> Dict:
    """
    Scrape basic model info on an already-open Playwright *page*.
    The page should already be navigated to the target URL and rendered.

    Returns:
      {
        "model_code": str, "name": str, "description": str,
        "provider": str, "collection_tag": str, "updated_at": str,
        "open_source": bool, "capabilities": [...], "features": [...],
        "input_modalities": [...], "output_modalities": [...],
        "error": str | None
      }

    NOTE: response listeners must normally be registered *before*
    navigation; this variant only catches responses still in flight.
    Use ``scrape_model_info_standalone`` for a self-contained run.
    """
    target = _extract_model_id_from_url(url)
    result: Dict = {"model_code": target, "error": None}

    api_data: List[Dict] = []

    def on_response(resp):
        # Best-effort capture: never let a handler error break the page.
        try:
            if "application/json" not in resp.headers.get("content-type", ""):
                return
            if not API_URL_RE.search(resp.url):
                return
            try:
                api_data.append(resp.json())
            except Exception:
                pass
        except Exception:
            pass

    page.on("response", on_response)
    try:
        # Short grace period so in-flight API responses get captured.
        time.sleep(0.5)
    finally:
        # The page is caller-owned: detach the handler so repeated calls
        # don't accumulate listeners.
        page.remove_listener("response", on_response)

    # Look for the target model in the captured API payloads.
    model_obj = None
    for body in api_data:
        found = _find_model_in_json(body, target)
        if found:
            model_obj = found
            break

    if model_obj:
        result.update(parse_model_info(model_obj))
    else:
        result["error"] = f"未从 API 响应中找到模型 '{target}',可能需要登录或模型 ID 不匹配"
    return result


# ── standalone entry point ────────────────────────────────────────────────────
def scrape_model_info_standalone(
    url: str,
    headless: bool = True,
    timeout: int = 20000,
    executable_path: Optional[str] = None,
) -> Dict:
    """Standalone run: launch a browser, navigate, scrape, then close it.

    The browser is always closed, even when parsing raises.
    """
    from playwright.sync_api import sync_playwright

    target = _extract_model_id_from_url(url)
    result: Dict = {"url": url, "model_code": target, "error": None}
    api_data: List[Dict] = []

    with sync_playwright() as p:
        launch_kwargs: Dict = {"headless": headless}
        if executable_path:
            launch_kwargs["executable_path"] = executable_path
        browser = p.chromium.launch(**launch_kwargs)
        try:
            page = browser.new_context().new_page()

            def on_response(resp):
                # Best-effort capture of matching JSON API responses.
                try:
                    if "application/json" not in resp.headers.get("content-type", ""):
                        return
                    if not API_URL_RE.search(resp.url):
                        return
                    try:
                        api_data.append(resp.json())
                    except Exception:
                        pass
                except Exception:
                    pass

            # Register before navigation so every API response is seen.
            page.on("response", on_response)

            try:
                page.goto(url, wait_until="networkidle", timeout=timeout)
            except PlaywrightTimeoutError:
                # networkidle can hang on chatty pages; retry with "load".
                try:
                    page.goto(url, wait_until="load", timeout=timeout)
                except Exception as e:
                    result["error"] = f"导航失败: {e}"
                    return result

            # Wait until one of the known section headings is visible.
            for sel in ["text=模型介绍", "text=模型能力", "text=模型价格"]:
                try:
                    page.wait_for_selector(sel, timeout=6000)
                    break
                except PlaywrightTimeoutError:
                    pass
            time.sleep(1.0)

            model_obj = None
            for body in api_data:
                found = _find_model_in_json(body, target)
                if found:
                    model_obj = found
                    break

            if model_obj:
                result.update(parse_model_info(model_obj))
            else:
                result["error"] = f"未从 API 响应中找到模型 '{target}'"
        finally:
            browser.close()

    return result


if __name__ == "__main__":
    import argparse
    import os

    ap = argparse.ArgumentParser(description="抓取阿里云模型基本信息与能力")
    group = ap.add_mutually_exclusive_group(required=True)
    group.add_argument("--url", help="模型页面 URL")
    group.add_argument("--file", help="URL 列表文件(每行一个)")
    ap.add_argument("--headful", action="store_true")
    ap.add_argument("--timeout", type=int, default=20000)
    ap.add_argument("--browser-path")
    args = ap.parse_args()

    if args.url:
        urls = [args.url]
    else:
        # Close the file handle deterministically.
        with open(args.file, encoding="utf-8") as fh:
            urls = fh.read().splitlines()
    urls = [u.strip() for u in urls if u.strip()]

    exec_path = args.browser_path or os.environ.get("PLAYWRIGHT_EXECUTABLE")
    headless = not args.headful

    results = []
    for u in urls:
        print(f"抓取模型信息: {u}", flush=True)
        results.append(
            scrape_model_info_standalone(
                u,
                headless=headless,
                timeout=args.timeout,
                executable_path=exec_path,
            )
        )
    print(json.dumps(results, ensure_ascii=False, indent=2))