# scrape_model_icon.py (paste header removed — original export listed the filename,
# file size, and a run of concatenated line numbers here, which is not valid Python)
  1. #!/usr/bin/env python3
  2. """
  3. Aliyun Model Icon Scraper
  4. 用 Playwright 渲染模型详情页,从 DOM 中提取模型图标(SVG 或 img)。
  5. 用法:
  6. python scrape_model_icon.py --url "https://bailian.console.aliyun.com/.../qwen3-max"
  7. python scrape_model_icon.py --url "..." --save-svg icons/qwen3-max.svg
  8. python scrape_model_icon.py --url "..." --screenshot icons/qwen3-max.png
  9. """
  10. import argparse
  11. import json
  12. import os
  13. import re
  14. import time
  15. from typing import Optional, Dict
  16. from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError
  17. # 按优先级依次尝试的选择器
  18. # 卡片大图标区域(模型详情页左上角)
  19. _ICON_SELECTORS = [
  20. # 模型详情页 header 里的图标容器
  21. '[class*="modelIcon"] svg',
  22. '[class*="modelIcon"] img',
  23. '[class*="model-icon"] svg',
  24. '[class*="model-icon"] img',
  25. # 面包屑里的小图标(备用)
  26. '[class*="currentModelIcon"]',
  27. # 通用:页面 header 区域第一个 svg
  28. '.pageHeader svg',
  29. '[class*="pageHeader"] svg',
  30. # 最后兜底:页面内第一个尺寸合理的 svg
  31. ]
def _extract_icon_from_page(page) -> Dict:
    """Extract the model icon from an already-rendered Playwright page.

    Runs a JS snippet in the page that walks a priority-ordered selector
    list, skipping invisible elements, and returns the first hit:
    - an ``svg`` element: its ``outerHTML`` (with ``xmlns`` ensured on a clone),
    - an ``img`` element: its ``src`` URL,
    - a container: the first visible ``svg``/``img`` inside it,
    - fallback: the first visible page svg sized between 24 and 200 px.

    Returns:
        dict with keys ``type`` ("svg" | "img" | "none"), ``data`` (SVG
        markup or img URL, or None) and ``selector`` (the matching CSS
        selector, or None).
    """
    # The selector list below mirrors the module-level _ICON_SELECTORS;
    # it is duplicated because the logic runs inside the browser context.
    result = page.evaluate(
        """
        () => {
            const selectors = [
                '[class*="modelIcon"] svg',
                '[class*="modelIcon"] img',
                '[class*="model-icon"] svg',
                '[class*="model-icon"] img',
                '[class*="currentModelIcon"]',
                '.pageHeader svg',
                '[class*="pageHeader"] svg',
            ];
            const isVisible = (el) => {
                if (!el) return false;
                const r = el.getBoundingClientRect();
                const s = window.getComputedStyle(el);
                return r.width > 0 && r.height > 0
                    && s.display !== 'none'
                    && s.visibility !== 'hidden'
                    && s.opacity !== '0';
            };
            for (const sel of selectors) {
                const el = document.querySelector(sel);
                if (!el || !isVisible(el)) continue;
                if (el.tagName.toLowerCase() === 'svg') {
                    // 克隆并清理,确保 SVG 有 xmlns
                    const clone = el.cloneNode(true);
                    if (!clone.getAttribute('xmlns')) {
                        clone.setAttribute('xmlns', 'http://www.w3.org/2000/svg');
                    }
                    return { type: 'svg', data: clone.outerHTML, selector: sel };
                }
                if (el.tagName.toLowerCase() === 'img') {
                    return { type: 'img', data: el.src || el.getAttribute('src'), selector: sel };
                }
                // 容器里找 svg/img
                const svg = el.querySelector('svg');
                if (svg && isVisible(svg)) {
                    const clone = svg.cloneNode(true);
                    if (!clone.getAttribute('xmlns')) {
                        clone.setAttribute('xmlns', 'http://www.w3.org/2000/svg');
                    }
                    return { type: 'svg', data: clone.outerHTML, selector: sel + ' > svg' };
                }
                const img = el.querySelector('img');
                if (img && isVisible(img)) {
                    return { type: 'img', data: img.src || img.getAttribute('src'), selector: sel + ' > img' };
                }
            }
            // 兜底:找页面内所有 svg,取尺寸在 24~200px 之间的第一个
            const allSvgs = Array.from(document.querySelectorAll('svg'));
            for (const svg of allSvgs) {
                if (!isVisible(svg)) continue;
                const r = svg.getBoundingClientRect();
                if (r.width >= 24 && r.width <= 200 && r.height >= 24 && r.height <= 200) {
                    const clone = svg.cloneNode(true);
                    if (!clone.getAttribute('xmlns')) {
                        clone.setAttribute('xmlns', 'http://www.w3.org/2000/svg');
                    }
                    return { type: 'svg', data: clone.outerHTML, selector: 'svg[fallback]' };
                }
            }
            return { type: 'none', data: null, selector: null };
        }
        """
    )
    # page.evaluate may yield None if the snippet returned undefined;
    # normalize to the "none" shape so callers can index unconditionally.
    return result or {"type": "none", "data": None, "selector": None}
  104. def scrape_model_icon(
  105. url: str,
  106. headless: bool = True,
  107. timeout: int = 20000,
  108. executable_path: Optional[str] = None,
  109. save_svg: Optional[str] = None,
  110. screenshot: Optional[str] = None,
  111. ) -> Dict:
  112. """
  113. 抓取模型图标。
  114. 返回:
  115. {
  116. "url": str,
  117. "icon_type": "svg" | "img" | "none",
  118. "icon_data": str, # SVG outerHTML 或 img src URL
  119. "selector": str, # 命中的选择器
  120. "error": str | None
  121. }
  122. """
  123. result = {"url": url, "icon_type": "none", "icon_data": None, "selector": None, "error": None}
  124. with sync_playwright() as p:
  125. launch_kwargs = {"headless": headless}
  126. if executable_path:
  127. launch_kwargs["executable_path"] = executable_path
  128. browser = p.chromium.launch(**launch_kwargs)
  129. page = browser.new_context().new_page()
  130. try:
  131. page.goto(url, wait_until="networkidle", timeout=timeout)
  132. except PlaywrightTimeoutError:
  133. try:
  134. page.goto(url, wait_until="load", timeout=timeout)
  135. except Exception as e:
  136. result["error"] = f"导航失败: {e}"
  137. browser.close()
  138. return result
  139. # 等待页面主体内容出现
  140. for sel in ["text=模型介绍", "text=模型价格", '[class*="modelIcon"]', '[class*="pageHeader"]']:
  141. try:
  142. page.wait_for_selector(sel, timeout=6000)
  143. break
  144. except PlaywrightTimeoutError:
  145. pass
  146. time.sleep(1.0)
  147. icon = _extract_icon_from_page(page)
  148. result["icon_type"] = icon["type"]
  149. result["icon_data"] = icon["data"]
  150. result["selector"] = icon["selector"]
  151. # 如果是 img 且 src 是 SVG URL,直接下载内容转成 svg_data
  152. if icon["type"] == "img" and icon["data"] and icon["data"].endswith(".svg"):
  153. try:
  154. import urllib.request
  155. with urllib.request.urlopen(icon["data"], timeout=10) as resp:
  156. svg_content = resp.read().decode("utf-8")
  157. result["icon_type"] = "svg"
  158. result["icon_data"] = svg_content
  159. result["icon_url"] = icon["data"]
  160. icon = {**icon, "type": "svg", "data": svg_content}
  161. except Exception as e:
  162. result["fetch_error"] = str(e)
  163. # 保存 SVG 文件
  164. if save_svg and icon["type"] == "svg" and icon["data"]:
  165. os.makedirs(os.path.dirname(save_svg) or ".", exist_ok=True)
  166. with open(save_svg, "w", encoding="utf-8") as f:
  167. f.write(icon["data"])
  168. result["saved_svg"] = save_svg
  169. # 截图保存
  170. if screenshot:
  171. os.makedirs(os.path.dirname(screenshot) or ".", exist_ok=True)
  172. try:
  173. # 优先截图图标元素本身
  174. el = page.locator(
  175. '[class*="modelIcon"]'
  176. ).first
  177. if el.count() > 0:
  178. el.screenshot(path=screenshot)
  179. else:
  180. page.screenshot(path=screenshot, full_page=False)
  181. result["saved_screenshot"] = screenshot
  182. except Exception as e:
  183. result["screenshot_error"] = str(e)
  184. browser.close()
  185. return result
  186. def main():
  187. ap = argparse.ArgumentParser(description="爬取阿里云百炼模型图标(SVG/img)")
  188. ap.add_argument("--url", required=True, help="模型详情页 URL")
  189. ap.add_argument("--headful", action="store_true", help="有头模式(方便调试)")
  190. ap.add_argument("--timeout", type=int, default=20000)
  191. ap.add_argument("--browser-path", help="浏览器可执行文件路径")
  192. ap.add_argument("--save-svg", help="将 SVG 保存到指定路径,如 icons/qwen3-max.svg")
  193. ap.add_argument("--screenshot", help="将图标截图保存为 PNG,如 icons/qwen3-max.png")
  194. args = ap.parse_args()
  195. exec_path = args.browser_path or os.environ.get("PLAYWRIGHT_EXECUTABLE")
  196. headless = not args.headful
  197. if os.environ.get("PLAYWRIGHT_HEADLESS", "").lower() == "false":
  198. headless = False
  199. result = scrape_model_icon(
  200. url=args.url,
  201. headless=headless,
  202. timeout=args.timeout,
  203. executable_path=exec_path,
  204. save_svg=args.save_svg,
  205. screenshot=args.screenshot,
  206. )
  207. print(json.dumps(result, ensure_ascii=False, indent=2))
  208. if __name__ == "__main__":
  209. main()