| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342 |
- #!/usr/bin/env python3
- """
- scrape_model_info.py
- 抓取阿里云百炼模型页面的基本信息和能力:
- - 模型 Code(model_code)
- - 模型名称(name)
- - 模型描述(description)
- - 能力标签(capabilities):如 文本生成、深度思考
- - 功能特性(features):如 function calling、联网搜索
- - 输入/输出模态(input_modalities / output_modalities)
- 原理:拦截页面请求的后端 API(listFoundationModels 等),
- 直接从 JSON 响应中提取,比解析 HTML 更准确。
- """
- import re
- import time
- import json
- from typing import Any, Dict, List, Optional
- from playwright.sync_api import Page, TimeoutError as PlaywrightTimeoutError
# Capability code -> label shown in the model page's tag row.
CAPABILITY_LABELS: Dict[str, str] = {
    "TG": "文本生成",
    "QwQ": "深度思考",
    "Reasoning": "深度思考",
    "VU": "视觉理解",
    "AU": "音频理解",
    "VID": "视频理解",
    "VG": "视频生成",
    "IMG": "图像生成",
    "IG": "图像生成",
    "EMB": "向量表示",
    "ASR": "语音识别",
    "TTS": "语音合成",
}

# Feature rows that the page always displays.
# key = feature string returned by the API, value = name displayed on the page.
# Entries are listed in page order (left column top to bottom, then the right
# column top to bottom); ALL_FEATURES_ORDERED below relies on this order.
FEATURE_LABELS: Dict[str, str] = {
    "model-experience": "模型体验",
    "function-calling": "function calling",
    "structured-outputs": "结构化输出",
    "web-search": "联网搜索",
    "prefix-completion": "前缀续写",
    "cache": "cache存储",
    "batch": "批量推理",
    "model-optimization": "模型调优",
}

# Fixed order of every feature row in the page's capability area; taken from
# the insertion order of FEATURE_LABELS so the two can never drift apart.
ALL_FEATURES_ORDERED: List[str] = list(FEATURE_LABELS)

# Case-insensitive keywords identifying the backend API URLs to intercept.
API_URL_RE = re.compile(
    r"listFoundationModels|listRecommendedModels|listFeaturedModels|getModelDetail|modelCenter",
    re.IGNORECASE,
)
def _extract_model_id_from_url(url: str) -> str:
    """Return the model ID encoded in a model page URL.

    The ID is looked up first in the URL fragment, e.g.
    ``#/model-market/detail/qwen3-max`` -> ``qwen3-max``.  When the fragment
    has no ``/detail/<id>`` part, the last non-empty path segment (query
    string and fragment removed) is returned instead, or ``""`` when there is
    no such segment.
    """
    in_fragment = re.search(r"#.*?/detail/([^/?#&]+)", url)
    if in_fragment is not None:
        return in_fragment.group(1).strip()

    # Fallback: drop everything from the first "?" or "#" and use the tail
    # of what remains.
    path_only = re.sub(r"[?#].*", "", url)
    segments = [seg for seg in path_only.rstrip("/").split("/") if seg]
    if not segments:
        return ""
    return segments[-1]
def _merge_with_items(obj: Dict) -> Dict:
    """Fill empty fields of *obj* from the first entry of its ``items`` list.

    When ``obj["items"]`` is missing, empty, or not a list, *obj* itself is
    returned untouched.  Otherwise a shallow copy is returned in which each
    of ``features``, ``modelInfo``, ``inferenceMetadata``, ``capabilities``
    and ``description`` that is falsy on *obj* takes the value found on
    ``items[0]`` (when that value is truthy).  Values already set on *obj*
    always win, and *obj* is never mutated.
    """
    children = obj.get("items", [])
    if not isinstance(children, list) or not children:
        return obj

    first = children[0]
    donor = first if isinstance(first, dict) else {}

    combined = dict(obj)
    # "description" follows exactly the same rule as the other fields, so it
    # is handled by the same loop (kept last to preserve key insertion order).
    for field in (
        "features",
        "modelInfo",
        "inferenceMetadata",
        "capabilities",
        "description",
    ):
        inherited = donor.get(field)
        if inherited and not combined.get(field):
            combined[field] = inherited
    return combined
def _find_model_in_json(data: Any, target: str) -> Optional[Dict]:
    """Search decoded JSON for the model object whose ``model`` equals *target*.

    The search is a depth-first, document-order walk over nested dicts and
    lists.  A dict matches when its ``model`` value, lower-cased and with a
    leading ``group-`` removed, is exactly equal to *target* normalised the
    same way.  Only exact matches on ``model`` count; ``name`` is not
    consulted and prefixes do not match.

    Args:
        data: Any decoded JSON value (dict, list, or scalar).
        target: Model ID to look for; case and a ``group-`` prefix are ignored.

    Returns:
        The first matching dict, passed through ``_merge_with_items`` so that
        empty fields are filled from its first child item, or ``None`` when
        nothing matches.  An empty *target* never matches anything.
    """
    clean_target = re.sub(r"^group-", "", target.lower())
    if not clean_target:
        # Without this guard an empty ID would "match" every object that
        # merely lacks a model value, returning an unrelated model.
        return None

    # Explicit stack instead of recursion: the target is normalised only once
    # and deeply nested payloads cannot exhaust the recursion limit.
    pending = [data]
    while pending:
        node = pending.pop()
        if isinstance(node, dict):
            raw_model = node.get("model")
            # A JSON null must not be compared as the string "none".
            if raw_model is not None:
                model_val = re.sub(r"^group-", "", str(raw_model).lower())
                if model_val == clean_target:
                    return _merge_with_items(node)
            # Reversed so that children are popped in their original order.
            pending.extend(reversed(list(node.values())))
        elif isinstance(node, list):
            pending.extend(reversed(node))
    return None
def parse_model_info(model_obj: Dict) -> Dict:
    """Convert a raw API model object into the structured info dict.

    Keys are inserted in the order used on the page:
    ``model_code`` -> ``display_tags`` -> ``description`` ->
    ``input_modalities`` -> ``output_modalities`` -> ``features``.

    Fields that are absent *or* explicitly ``null`` in the API payload are
    treated as empty, so a partially filled object never raises.

    Args:
        model_obj: One model dict taken from the API response.

    Returns:
        A dict with:
        - ``model_code``: the ``model`` value without a leading ``group-``.
        - ``display_tags``: the collection tag (with a leading "qwen"
          normalised to "Qwen") followed by the de-duplicated capability
          labels; unknown capability codes are kept as-is.
        - ``description``: ``description``, else ``shortDescription``, else "".
        - ``input_modalities`` / ``output_modalities``: ``request_modality``
          and ``response_modality`` from ``inferenceMetadata``.
        - ``features``: one ``{display name: bool}`` entry per item of
          ``ALL_FEATURES_ORDERED``.
    """
    info: Dict = {}

    # dict.get(key, default) only falls back when the key is missing, so
    # "or <empty>" is used throughout to cover explicit JSON nulls as well.

    # Model code
    info["model_code"] = re.sub(r"^group-", "", str(model_obj.get("model") or ""))

    # Tag row of the model introduction, e.g. "Qwen3 · 文本生成 · 深度思考"
    display_tags: List[str] = []
    collection_tag = model_obj.get("collectionTag") or ""
    if collection_tag:
        display_tags.append(re.sub(r"^qwen", "Qwen", collection_tag, flags=re.I))
    for code in model_obj.get("capabilities") or []:
        label = CAPABILITY_LABELS.get(code, code)
        # Several codes share one label (e.g. QwQ / Reasoning); show it once.
        if label not in display_tags:
            display_tags.append(label)
    info["display_tags"] = display_tags

    # Description
    info["description"] = (
        model_obj.get("description")
        or model_obj.get("shortDescription")
        or ""
    )

    # Input / output modalities
    meta = model_obj.get("inferenceMetadata") or {}
    info["input_modalities"] = meta.get("request_modality") or []
    info["output_modalities"] = meta.get("response_modality") or []

    # Features: always the full fixed list, each flagged True/False
    features_set = set(model_obj.get("features") or [])
    info["features"] = {
        FEATURE_LABELS[key]: (key in features_set)
        for key in ALL_FEATURES_ORDERED
    }
    return info
def scrape_model_info(page: Page, url: str, wait_ms: int = 500) -> Dict:
    """Collect model info from API responses seen on an already opened page.

    A temporary ``response`` listener is attached to *page*, the page is
    given *wait_ms* milliseconds to deliver matching JSON responses, and the
    listener is detached again.  Only responses that arrive during that
    window are seen; this function does not navigate or reload the page.

    Args:
        page: Playwright page, expected to be at (or loading) *url*.
        url: Model page URL; used only to derive the model ID to look for.
        wait_ms: How long to listen for API responses, in milliseconds.

    Returns:
        A dict that always contains ``model_code`` (the ID derived from
        *url*) and ``error``.  When the model is found in a captured
        response, ``error`` is ``None`` and the keys produced by
        ``parse_model_info`` (``model_code``, ``display_tags``,
        ``description``, ``input_modalities``, ``output_modalities``,
        ``features``) are merged in; otherwise ``error`` holds a message.
    """
    target = _extract_model_id_from_url(url)
    result: Dict = {"model_code": target, "error": None}
    api_data: List[Dict] = []

    def on_response(resp):
        try:
            if "application/json" not in resp.headers.get("content-type", ""):
                return
            if not API_URL_RE.search(resp.url):
                return
            api_data.append(resp.json())
        except Exception:
            # Deliberately best effort: a response whose body is unavailable
            # or not valid JSON is skipped so it cannot abort the scrape.
            pass

    page.on("response", on_response)
    try:
        # time.sleep() would block the sync-API dispatcher, so no response
        # event could reach the listener; wait_for_timeout keeps it running.
        page.wait_for_timeout(wait_ms)
    finally:
        # Detach so repeated calls on the same page do not pile up listeners.
        page.remove_listener("response", on_response)

    # Look the model up in whatever was captured.
    model_obj = None
    for body in api_data:
        found = _find_model_in_json(body, target)
        if found:
            model_obj = found
            break

    if model_obj:
        result.update(parse_model_info(model_obj))
    else:
        result["error"] = f"未从 API 响应中找到模型 '{target}',可能需要登录或模型 ID 不匹配"
    return result
# ── Standalone entry point ─────────────────────────────────────────────────────
def scrape_model_info_standalone(
    url: str,
    headless: bool = True,
    timeout: int = 20000,
    executable_path: Optional[str] = None,
) -> Dict:
    """Launch a browser, open *url*, scrape the model info, and shut down.

    The response listener is registered before navigation so the API calls
    made while the page loads are captured.

    Args:
        url: Model page URL.
        headless: Whether Chromium runs without a visible window.
        timeout: Navigation timeout in milliseconds, applied to each attempt.
        executable_path: Optional path to a specific browser executable.

    Returns:
        A dict that always contains ``url``, ``model_code`` and ``error``.
        On success ``error`` is ``None`` and the keys produced by
        ``parse_model_info`` are merged in.  ``error`` holds a message when
        the fallback navigation fails or the model is not found in any
        captured response.
    """
    from playwright.sync_api import sync_playwright

    target = _extract_model_id_from_url(url)
    result: Dict = {"url": url, "model_code": target, "error": None}
    api_data: List[Dict] = []

    def on_response(resp):
        try:
            if "application/json" not in resp.headers.get("content-type", ""):
                return
            if not API_URL_RE.search(resp.url):
                return
            api_data.append(resp.json())
        except Exception:
            # Deliberately best effort: a response whose body is unavailable
            # or not valid JSON is skipped so it cannot abort the scrape.
            pass

    with sync_playwright() as p:
        launch_kwargs: Dict = {"headless": headless}
        if executable_path:
            launch_kwargs["executable_path"] = executable_path
        browser = p.chromium.launch(**launch_kwargs)
        try:
            page = browser.new_context().new_page()
            page.on("response", on_response)

            try:
                page.goto(url, wait_until="networkidle", timeout=timeout)
            except PlaywrightTimeoutError:
                # The network never went idle; retry with the weaker "load"
                # condition before giving up.
                try:
                    page.goto(url, wait_until="load", timeout=timeout)
                except Exception as e:
                    result["error"] = f"导航失败: {e}"
                    return result

            # Wait until any one of the known section headings is rendered.
            for sel in ("text=模型介绍", "text=模型能力", "text=模型价格"):
                try:
                    page.wait_for_selector(sel, timeout=6000)
                    break
                except PlaywrightTimeoutError:
                    pass

            # Grace period for late API responses.  time.sleep() would block
            # the sync-API dispatcher and the listener would receive nothing.
            page.wait_for_timeout(1000)

            model_obj = None
            for body in api_data:
                found = _find_model_in_json(body, target)
                if found:
                    model_obj = found
                    break

            if model_obj:
                result.update(parse_model_info(model_obj))
            else:
                result["error"] = f"未从 API 响应中找到模型 '{target}'"
        finally:
            # Runs on every exit path, including unexpected exceptions.
            browser.close()
    return result
# Command-line entry point: scrape one URL (--url) or every non-blank line of
# a file (--file) and print all results as a single JSON array.
if __name__ == "__main__":
    import argparse
    import os

    ap = argparse.ArgumentParser(description="抓取阿里云模型基本信息与能力")
    group = ap.add_mutually_exclusive_group(required=True)
    group.add_argument("--url", help="模型页面 URL")
    group.add_argument("--file", help="URL 列表文件(每行一个)")
    ap.add_argument("--headful", action="store_true")
    ap.add_argument("--timeout", type=int, default=20000)
    ap.add_argument("--browser-path")
    args = ap.parse_args()

    if args.url:
        raw_urls = [args.url]
    else:
        # Context manager so the list file is closed as soon as it is read.
        with open(args.file, encoding="utf-8") as fh:
            raw_urls = fh.read().splitlines()
    urls = [u.strip() for u in raw_urls if u.strip()]

    # An explicit --browser-path wins over the PLAYWRIGHT_EXECUTABLE variable.
    exec_path = args.browser_path or os.environ.get("PLAYWRIGHT_EXECUTABLE")
    headless = not args.headful

    results = []
    for u in urls:
        print(f"抓取模型信息: {u}", flush=True)
        results.append(
            scrape_model_info_standalone(
                u,
                headless=headless,
                timeout=args.timeout,
                executable_path=exec_path,
            )
        )
    print(json.dumps(results, ensure_ascii=False, indent=2))
|