scrape_model_info.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354
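#!/usr/bin/env python3
"""
scrape_model_info.py

Scrape the basic information and capabilities shown on an Alibaba Cloud
Bailian (Model Studio) model page:
- model code (model_code)
- model name (name)
- model description (description)
- capability tags (capabilities), e.g. text generation, deep thinking
- features (features), e.g. function calling, web search
- input / output modalities (input_modalities / output_modalities)

How it works: intercept the backend API requests issued by the page
(listFoundationModels and similar) and extract the data directly from the
JSON responses, which is more accurate than parsing the HTML.
"""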
  14. import re
  15. import time
  16. import json
  17. from typing import Any, Dict, List, Optional
  18. from playwright.sync_api import Page, TimeoutError as PlaywrightTimeoutError
  19. # 能力标签映射
  20. CAPABILITY_LABELS: Dict[str, str] = {
  21. "TG": "文本生成",
  22. "QwQ": "深度思考",
  23. "Reasoning": "深度思考",
  24. "VU": "视觉理解",
  25. "AU": "音频理解",
  26. "VID": "视频理解",
  27. "VG": "视频生成",
  28. "IMG": "图像生成",
  29. "IG": "图像生成",
  30. "EMB": "向量表示",
  31. "TR": "向量模型",
  32. "ME": "多模态向量",
  33. "ASR": "语音识别",
  34. "Realtime-ASR": "实时语音识别",
  35. "RealtimeASR": "实时语音识别",
  36. "TTS": "语音合成",
  37. }
  38. # 页面上固定展示的功能项(按截图顺序:左列从上到下,右列从上到下)
  39. # key = API 返回的 feature 字符串,value = 页面显示名
  40. FEATURE_LABELS: Dict[str, str] = {
  41. "model-experience": "模型体验",
  42. "function-calling": "function calling",
  43. "structured-outputs": "结构化输出",
  44. "web-search": "联网搜索",
  45. "prefix-completion": "前缀续写",
  46. "cache": "cache存储",
  47. "batch": "批量推理",
  48. "model-optimization": "模型调优",
  49. }
  50. # 页面能力区域所有功能项的固定顺序(与截图一致)
  51. ALL_FEATURES_ORDERED: List[str] = [
  52. "model-experience", # 模型体验
  53. "function-calling", # function calling
  54. "structured-outputs", # 结构化输出
  55. "web-search", # 联网搜索
  56. "prefix-completion", # 前缀续写
  57. "cache", # cache存储
  58. "batch", # 批量推理
  59. "model-optimization", # 模型调优
  60. ]
  61. # 需要拦截的 API URL 关键词
  62. API_URL_RE = re.compile(
  63. r"listFoundationModels|listRecommendedModels|listFeaturedModels|getModelDetail|modelCenter",
  64. re.I,
  65. )
  66. def _extract_model_id_from_url(url: str) -> str:
  67. """从页面 URL 的 hash 部分提取模型 ID,如 #/model-market/detail/qwen3-max -> qwen3-max。"""
  68. # 优先从 hash 提取
  69. hash_match = re.search(r"#.*?/detail/([^/?#&]+)", url)
  70. if hash_match:
  71. return hash_match.group(1).strip()
  72. # 回退:取最后一段路径
  73. clean = re.sub(r"[?#].*", "", url)
  74. parts = [p for p in clean.rstrip("/").split("/") if p]
  75. return parts[-1] if parts else ""
  76. def _merge_with_items(obj: Dict) -> Dict:
  77. """
  78. 如果对象是"组"(group=true 或 model 以 group- 开头),
  79. 用其 items[0] 的数据补充缺失的 features / modelInfo / inferenceMetadata。
  80. """
  81. items = obj.get("items", [])
  82. if not items or not isinstance(items, list):
  83. return obj
  84. # 合并:组对象字段优先,子模型补充缺失字段
  85. merged = dict(obj)
  86. child = items[0] if isinstance(items[0], dict) else {}
  87. for key in ("features", "modelInfo", "inferenceMetadata", "capabilities"):
  88. # 只在组对象该字段为空时才用子模型的值补充
  89. group_val = merged.get(key)
  90. child_val = child.get(key)
  91. if not group_val and child_val:
  92. merged[key] = child_val
  93. # description 优先用组对象的,若为空则用子模型的
  94. if not merged.get("description") and child.get("description"):
  95. merged["description"] = child["description"]
  96. return merged
  97. def _find_model_in_json(data: Any, target: str) -> Optional[Dict]:
  98. """
  99. 递归在 JSON 数据中查找与 target 匹配的模型对象。
  100. 匹配规则:model 或 name 字段去掉 group- 前缀后与 target 完全相等(优先),
  101. 或 target 是 model_val 的完整前缀(如 qwen3-max 匹配 qwen3-max-0919)。
  102. 找到后自动用 items[0] 补充缺失字段。
  103. """
  104. clean_target = re.sub(r"^group-", "", target.lower())
  105. if isinstance(data, dict):
  106. model_val = re.sub(r"^group-", "", str(data.get("model", "")).lower())
  107. name_val = str(data.get("name", "")).lower()
  108. # 精确匹配 model 字段
  109. is_match = (model_val == clean_target)
  110. if is_match and ("model" in data or "name" in data):
  111. return _merge_with_items(data)
  112. for v in data.values():
  113. found = _find_model_in_json(v, target)
  114. if found:
  115. return found
  116. elif isinstance(data, list):
  117. for item in data:
  118. found = _find_model_in_json(item, target)
  119. if found:
  120. return found
  121. return None
  122. def parse_model_info(model_obj: Dict) -> Dict:
  123. """
  124. 将 API 返回的模型对象解析为结构化信息,字段顺序与页面一致:
  125. model_code -> display_tags -> description -> input_modalities
  126. -> output_modalities -> features(固定8项,true/false)
  127. """
  128. info: Dict = {}
  129. # ── 模型 Code ──
  130. info["model_code"] = re.sub(r"^group-", "", model_obj.get("model", ""))
  131. # ── 模型介绍标签(页面红框第一行:Qwen3 · 文本生成 · 深度思考) ──
  132. display_tags: List[str] = []
  133. ct = model_obj.get("collectionTag", "")
  134. if ct:
  135. display_tags.append(re.sub(r"^qwen", "Qwen", ct, flags=re.I))
  136. # 某些标签有"更具体版本",若更具体版本存在则跳过基础版
  137. SUPERSEDED_BY: Dict[str, str] = {
  138. "语音识别": "实时语音识别",
  139. }
  140. caps_raw: List[str] = model_obj.get("capabilities", [])
  141. all_labels = [CAPABILITY_LABELS.get(c, c) for c in caps_raw]
  142. labels_set = set(all_labels)
  143. for label in all_labels:
  144. superseded_by = SUPERSEDED_BY.get(label)
  145. if superseded_by and superseded_by in labels_set:
  146. continue # 有更具体的版本,跳过
  147. if label not in display_tags:
  148. display_tags.append(label)
  149. info["display_tags"] = display_tags
  150. # ── 模型描述 ──
  151. info["description"] = (
  152. model_obj.get("description", "")
  153. or model_obj.get("shortDescription", "")
  154. )
  155. # ── 输入/输出模态 ──
  156. meta = model_obj.get("inferenceMetadata", {})
  157. info["input_modalities"] = meta.get("request_modality", [])
  158. info["output_modalities"] = meta.get("response_modality", [])
  159. # ── 模型能力:固定8项,true/false ──
  160. features_raw: List[str] = model_obj.get("features", [])
  161. features_set = set(features_raw)
  162. info["features"] = {
  163. FEATURE_LABELS[key]: (key in features_set)
  164. for key in ALL_FEATURES_ORDERED
  165. }
  166. return info
  167. def scrape_model_info(page: Page, url: str) -> Dict:
  168. """
  169. 在已打开的 Playwright page 上抓取模型基本信息。
  170. page 应已导航到目标 URL 并完成渲染。
  171. 返回:
  172. {
  173. "model_code": str,
  174. "name": str,
  175. "description": str,
  176. "provider": str,
  177. "collection_tag": str,
  178. "updated_at": str,
  179. "open_source": bool,
  180. "capabilities": [...],
  181. "features": [...],
  182. "input_modalities": [...],
  183. "output_modalities": [...],
  184. "error": str | None
  185. }
  186. """
  187. target = _extract_model_id_from_url(url)
  188. result: Dict = {"model_code": target, "error": None}
  189. # 从已捕获的 API 响应中查找(需要在导航前注册监听器)
  190. # 这里提供一个独立运行版本,重新导航并拦截
  191. api_data: List[Dict] = []
  192. def on_response(resp):
  193. try:
  194. if "application/json" not in resp.headers.get("content-type", ""):
  195. return
  196. if not API_URL_RE.search(resp.url):
  197. return
  198. try:
  199. api_data.append(resp.json())
  200. except Exception:
  201. pass
  202. except Exception:
  203. pass
  204. page.on("response", on_response)
  205. # 等待 API 响应(页面可能已加载,这里等待一小段确保响应被捕获)
  206. time.sleep(0.5)
  207. # 在已有 API 数据中查找
  208. model_obj = None
  209. for body in api_data:
  210. found = _find_model_in_json(body, target)
  211. if found:
  212. model_obj = found
  213. break
  214. if model_obj:
  215. result.update(parse_model_info(model_obj))
  216. else:
  217. result["error"] = f"未从 API 响应中找到模型 '{target}',可能需要登录或模型 ID 不匹配"
  218. return result
  219. # ── 独立运行入口 ────────────────────────────────────────────────────────────────
  220. def scrape_model_info_standalone(
  221. url: str,
  222. headless: bool = True,
  223. timeout: int = 20000,
  224. executable_path: Optional[str] = None,
  225. ) -> Dict:
  226. """独立运行:自己启动浏览器,导航,抓取模型信息后关闭。"""
  227. from playwright.sync_api import sync_playwright
  228. target = _extract_model_id_from_url(url)
  229. result: Dict = {"url": url, "model_code": target, "error": None}
  230. api_data: List[Dict] = []
  231. with sync_playwright() as p:
  232. launch_kwargs: Dict = {"headless": headless}
  233. if executable_path:
  234. launch_kwargs["executable_path"] = executable_path
  235. browser = p.chromium.launch(**launch_kwargs)
  236. page = browser.new_context().new_page()
  237. def on_response(resp):
  238. try:
  239. if "application/json" not in resp.headers.get("content-type", ""):
  240. return
  241. if not API_URL_RE.search(resp.url):
  242. return
  243. try:
  244. api_data.append(resp.json())
  245. except Exception:
  246. pass
  247. except Exception:
  248. pass
  249. page.on("response", on_response)
  250. try:
  251. page.goto(url, wait_until="networkidle", timeout=timeout)
  252. except PlaywrightTimeoutError:
  253. try:
  254. page.goto(url, wait_until="load", timeout=timeout)
  255. except Exception as e:
  256. result["error"] = f"导航失败: {e}"
  257. browser.close()
  258. return result
  259. # 等待页面内容
  260. for sel in ["text=模型介绍", "text=模型能力", "text=模型价格"]:
  261. try:
  262. page.wait_for_selector(sel, timeout=6000)
  263. break
  264. except PlaywrightTimeoutError:
  265. pass
  266. time.sleep(1.0)
  267. model_obj = None
  268. for body in api_data:
  269. found = _find_model_in_json(body, target)
  270. if found:
  271. model_obj = found
  272. break
  273. if model_obj:
  274. result.update(parse_model_info(model_obj))
  275. else:
  276. result["error"] = f"未从 API 响应中找到模型 '{target}'"
  277. browser.close()
  278. return result
  279. if __name__ == "__main__":
  280. import argparse, os
  281. ap = argparse.ArgumentParser(description="抓取阿里云模型基本信息与能力")
  282. group = ap.add_mutually_exclusive_group(required=True)
  283. group.add_argument("--url", help="模型页面 URL")
  284. group.add_argument("--file", help="URL 列表文件(每行一个)")
  285. ap.add_argument("--headful", action="store_true")
  286. ap.add_argument("--timeout", type=int, default=20000)
  287. ap.add_argument("--browser-path")
  288. args = ap.parse_args()
  289. urls = [args.url] if args.url else open(args.file, encoding="utf-8").read().splitlines()
  290. urls = [u.strip() for u in urls if u.strip()]
  291. exec_path = args.browser_path or os.environ.get("PLAYWRIGHT_EXECUTABLE")
  292. headless = not args.headful
  293. results = []
  294. for u in urls:
  295. print(f"抓取模型信息: {u}", flush=True)
  296. results.append(scrape_model_info_standalone(u, headless=headless, timeout=args.timeout, executable_path=exec_path))
  297. print(json.dumps(results, ensure_ascii=False, indent=2))