#!/usr/bin/env python3
"""
scrape_model_info.py

Scrape the basic information and capabilities shown on an Alibaba Cloud
Bailian model page:
- model code (model_code)
- model name (name)
- model description (description)
- capability tags (capabilities), e.g. text generation, deep thinking
- features (features), e.g. function calling, web search
- input / output modalities (input_modalities / output_modalities)

How it works: intercept the backend API requests made by the page
(listFoundationModels and similar) and read the values directly from the
JSON responses, which is more accurate than parsing the HTML.
"""
import json
import re
import time
from typing import Any, Dict, List, Optional

from playwright.sync_api import Page, TimeoutError as PlaywrightTimeoutError
  19. # 能力标签映射
  20. CAPABILITY_LABELS: Dict[str, str] = {
  21. "TG": "文本生成",
  22. "QwQ": "深度思考",
  23. "Reasoning": "深度思考",
  24. "VU": "视觉理解",
  25. "AU": "音频理解",
  26. "VID": "视频理解",
  27. "VG": "视频生成",
  28. "IMG": "图像生成",
  29. "IG": "图像生成",
  30. "EMB": "向量表示",
  31. "ASR": "语音识别",
  32. "TTS": "语音合成",
  33. }
  34. # 页面上固定展示的功能项(按截图顺序:左列从上到下,右列从上到下)
  35. # key = API 返回的 feature 字符串,value = 页面显示名
  36. FEATURE_LABELS: Dict[str, str] = {
  37. "model-experience": "模型体验",
  38. "function-calling": "function calling",
  39. "structured-outputs": "结构化输出",
  40. "web-search": "联网搜索",
  41. "prefix-completion": "前缀续写",
  42. "cache": "cache存储",
  43. "batch": "批量推理",
  44. "model-optimization": "模型调优",
  45. }
  46. # 页面能力区域所有功能项的固定顺序(与截图一致)
  47. ALL_FEATURES_ORDERED: List[str] = [
  48. "model-experience", # 模型体验
  49. "function-calling", # function calling
  50. "structured-outputs", # 结构化输出
  51. "web-search", # 联网搜索
  52. "prefix-completion", # 前缀续写
  53. "cache", # cache存储
  54. "batch", # 批量推理
  55. "model-optimization", # 模型调优
  56. ]
  57. # 需要拦截的 API URL 关键词
  58. API_URL_RE = re.compile(
  59. r"listFoundationModels|listRecommendedModels|listFeaturedModels|getModelDetail|modelCenter",
  60. re.I,
  61. )
  62. def _extract_model_id_from_url(url: str) -> str:
  63. """从页面 URL 的 hash 部分提取模型 ID,如 #/model-market/detail/qwen3-max -> qwen3-max。"""
  64. # 优先从 hash 提取
  65. hash_match = re.search(r"#.*?/detail/([^/?#&]+)", url)
  66. if hash_match:
  67. return hash_match.group(1).strip()
  68. # 回退:取最后一段路径
  69. clean = re.sub(r"[?#].*", "", url)
  70. parts = [p for p in clean.rstrip("/").split("/") if p]
  71. return parts[-1] if parts else ""
  72. def _merge_with_items(obj: Dict) -> Dict:
  73. """
  74. 如果对象是"组"(group=true 或 model 以 group- 开头),
  75. 用其 items[0] 的数据补充缺失的 features / modelInfo / inferenceMetadata。
  76. """
  77. items = obj.get("items", [])
  78. if not items or not isinstance(items, list):
  79. return obj
  80. # 合并:组对象字段优先,子模型补充缺失字段
  81. merged = dict(obj)
  82. child = items[0] if isinstance(items[0], dict) else {}
  83. for key in ("features", "modelInfo", "inferenceMetadata", "capabilities"):
  84. # 只在组对象该字段为空时才用子模型的值补充
  85. group_val = merged.get(key)
  86. child_val = child.get(key)
  87. if not group_val and child_val:
  88. merged[key] = child_val
  89. # description 优先用组对象的,若为空则用子模型的
  90. if not merged.get("description") and child.get("description"):
  91. merged["description"] = child["description"]
  92. return merged
  93. def _find_model_in_json(data: Any, target: str) -> Optional[Dict]:
  94. """
  95. 递归在 JSON 数据中查找与 target 匹配的模型对象。
  96. 匹配规则:model 或 name 字段去掉 group- 前缀后与 target 完全相等(优先),
  97. 或 target 是 model_val 的完整前缀(如 qwen3-max 匹配 qwen3-max-0919)。
  98. 找到后自动用 items[0] 补充缺失字段。
  99. """
  100. clean_target = re.sub(r"^group-", "", target.lower())
  101. if isinstance(data, dict):
  102. model_val = re.sub(r"^group-", "", str(data.get("model", "")).lower())
  103. name_val = str(data.get("name", "")).lower()
  104. # 精确匹配 model 字段
  105. is_match = (model_val == clean_target)
  106. if is_match and ("model" in data or "name" in data):
  107. return _merge_with_items(data)
  108. for v in data.values():
  109. found = _find_model_in_json(v, target)
  110. if found:
  111. return found
  112. elif isinstance(data, list):
  113. for item in data:
  114. found = _find_model_in_json(item, target)
  115. if found:
  116. return found
  117. return None
  118. def parse_model_info(model_obj: Dict) -> Dict:
  119. """
  120. 将 API 返回的模型对象解析为结构化信息,字段顺序与页面一致:
  121. model_code -> display_tags -> description -> input_modalities
  122. -> output_modalities -> features(固定8项,true/false)
  123. """
  124. info: Dict = {}
  125. # ── 模型 Code ──
  126. info["model_code"] = re.sub(r"^group-", "", model_obj.get("model", ""))
  127. # ── 模型介绍标签(页面红框第一行:Qwen3 · 文本生成 · 深度思考) ──
  128. display_tags: List[str] = []
  129. ct = model_obj.get("collectionTag", "")
  130. if ct:
  131. display_tags.append(re.sub(r"^qwen", "Qwen", ct, flags=re.I))
  132. caps_raw: List[str] = model_obj.get("capabilities", [])
  133. for c in caps_raw:
  134. label = CAPABILITY_LABELS.get(c, c)
  135. if label not in display_tags:
  136. display_tags.append(label)
  137. info["display_tags"] = display_tags
  138. # ── 模型描述 ──
  139. info["description"] = (
  140. model_obj.get("description", "")
  141. or model_obj.get("shortDescription", "")
  142. )
  143. # ── 输入/输出模态 ──
  144. meta = model_obj.get("inferenceMetadata", {})
  145. info["input_modalities"] = meta.get("request_modality", [])
  146. info["output_modalities"] = meta.get("response_modality", [])
  147. # ── 模型能力:固定8项,true/false ──
  148. features_raw: List[str] = model_obj.get("features", [])
  149. features_set = set(features_raw)
  150. info["features"] = {
  151. FEATURE_LABELS[key]: (key in features_set)
  152. for key in ALL_FEATURES_ORDERED
  153. }
  154. return info
  155. def scrape_model_info(page: Page, url: str) -> Dict:
  156. """
  157. 在已打开的 Playwright page 上抓取模型基本信息。
  158. page 应已导航到目标 URL 并完成渲染。
  159. 返回:
  160. {
  161. "model_code": str,
  162. "name": str,
  163. "description": str,
  164. "provider": str,
  165. "collection_tag": str,
  166. "updated_at": str,
  167. "open_source": bool,
  168. "capabilities": [...],
  169. "features": [...],
  170. "input_modalities": [...],
  171. "output_modalities": [...],
  172. "error": str | None
  173. }
  174. """
  175. target = _extract_model_id_from_url(url)
  176. result: Dict = {"model_code": target, "error": None}
  177. # 从已捕获的 API 响应中查找(需要在导航前注册监听器)
  178. # 这里提供一个独立运行版本,重新导航并拦截
  179. api_data: List[Dict] = []
  180. def on_response(resp):
  181. try:
  182. if "application/json" not in resp.headers.get("content-type", ""):
  183. return
  184. if not API_URL_RE.search(resp.url):
  185. return
  186. try:
  187. api_data.append(resp.json())
  188. except Exception:
  189. pass
  190. except Exception:
  191. pass
  192. page.on("response", on_response)
  193. # 等待 API 响应(页面可能已加载,这里等待一小段确保响应被捕获)
  194. time.sleep(0.5)
  195. # 在已有 API 数据中查找
  196. model_obj = None
  197. for body in api_data:
  198. found = _find_model_in_json(body, target)
  199. if found:
  200. model_obj = found
  201. break
  202. if model_obj:
  203. result.update(parse_model_info(model_obj))
  204. else:
  205. result["error"] = f"未从 API 响应中找到模型 '{target}',可能需要登录或模型 ID 不匹配"
  206. return result
  207. # ── 独立运行入口 ────────────────────────────────────────────────────────────────
  208. def scrape_model_info_standalone(
  209. url: str,
  210. headless: bool = True,
  211. timeout: int = 20000,
  212. executable_path: Optional[str] = None,
  213. ) -> Dict:
  214. """独立运行:自己启动浏览器,导航,抓取模型信息后关闭。"""
  215. from playwright.sync_api import sync_playwright
  216. target = _extract_model_id_from_url(url)
  217. result: Dict = {"url": url, "model_code": target, "error": None}
  218. api_data: List[Dict] = []
  219. with sync_playwright() as p:
  220. launch_kwargs: Dict = {"headless": headless}
  221. if executable_path:
  222. launch_kwargs["executable_path"] = executable_path
  223. browser = p.chromium.launch(**launch_kwargs)
  224. page = browser.new_context().new_page()
  225. def on_response(resp):
  226. try:
  227. if "application/json" not in resp.headers.get("content-type", ""):
  228. return
  229. if not API_URL_RE.search(resp.url):
  230. return
  231. try:
  232. api_data.append(resp.json())
  233. except Exception:
  234. pass
  235. except Exception:
  236. pass
  237. page.on("response", on_response)
  238. try:
  239. page.goto(url, wait_until="networkidle", timeout=timeout)
  240. except PlaywrightTimeoutError:
  241. try:
  242. page.goto(url, wait_until="load", timeout=timeout)
  243. except Exception as e:
  244. result["error"] = f"导航失败: {e}"
  245. browser.close()
  246. return result
  247. # 等待页面内容
  248. for sel in ["text=模型介绍", "text=模型能力", "text=模型价格"]:
  249. try:
  250. page.wait_for_selector(sel, timeout=6000)
  251. break
  252. except PlaywrightTimeoutError:
  253. pass
  254. time.sleep(1.0)
  255. model_obj = None
  256. for body in api_data:
  257. found = _find_model_in_json(body, target)
  258. if found:
  259. model_obj = found
  260. break
  261. if model_obj:
  262. result.update(parse_model_info(model_obj))
  263. else:
  264. result["error"] = f"未从 API 响应中找到模型 '{target}'"
  265. browser.close()
  266. return result
  267. if __name__ == "__main__":
  268. import argparse, os
  269. ap = argparse.ArgumentParser(description="抓取阿里云模型基本信息与能力")
  270. group = ap.add_mutually_exclusive_group(required=True)
  271. group.add_argument("--url", help="模型页面 URL")
  272. group.add_argument("--file", help="URL 列表文件(每行一个)")
  273. ap.add_argument("--headful", action="store_true")
  274. ap.add_argument("--timeout", type=int, default=20000)
  275. ap.add_argument("--browser-path")
  276. args = ap.parse_args()
  277. urls = [args.url] if args.url else open(args.file, encoding="utf-8").read().splitlines()
  278. urls = [u.strip() for u in urls if u.strip()]
  279. exec_path = args.browser_path or os.environ.get("PLAYWRIGHT_EXECUTABLE")
  280. headless = not args.headful
  281. results = []
  282. for u in urls:
  283. print(f"抓取模型信息: {u}", flush=True)
  284. results.append(scrape_model_info_standalone(u, headless=headless, timeout=args.timeout, executable_path=exec_path))
  285. print(json.dumps(results, ensure_ascii=False, indent=2))