#!/usr/bin/env python3
"""Aliyun Model Icon Scraper.

Render a model detail page with Playwright and extract the model icon
(an inline SVG or an <img>) from the DOM.

Usage:
    python scrape_model_icon.py --url "https://bailian.console.aliyun.com/.../qwen3-max"
    python scrape_model_icon.py --url "..." --save-svg icons/qwen3-max.svg
    python scrape_model_icon.py --url "..." --screenshot icons/qwen3-max.png
"""

import argparse
import json
import os
import re
import time
import urllib.parse
import urllib.request
from typing import Optional, Dict

from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError

# Selectors tried in priority order. This list is the single source of truth:
# it is handed to the in-page JavaScript as the argument of page.evaluate().
_ICON_SELECTORS = [
    # Icon container in the model detail page header (large card icon,
    # top-left of the page).
    '[class*="modelIcon"] svg',
    '[class*="modelIcon"] img',
    '[class*="model-icon"] svg',
    '[class*="model-icon"] img',
    # Small icon in the breadcrumb (backup).
    '[class*="currentModelIcon"]',
    # Generic: first svg inside the page header area.
    '.pageHeader svg',
    '[class*="pageHeader"] svg',
    # Last resort (implemented in the JavaScript below): the first visible
    # svg on the page whose size is reasonable for an icon.
]

# Selectors whose appearance indicates that the page body has rendered.
_CONTENT_READY_SELECTORS = [
    "text=模型介绍",
    "text=模型价格",
    '[class*="modelIcon"]',
    '[class*="pageHeader"]',
]

# JavaScript executed inside the page. It receives the selector list as its
# only argument and returns {type, data, selector}.
_EXTRACT_ICON_JS = """
(selectors) => {
    const SVG_NS = 'http://www.w3.org/2000/svg';

    const isVisible = (el) => {
        if (!el) return false;
        const r = el.getBoundingClientRect();
        const s = window.getComputedStyle(el);
        return r.width > 0 && r.height > 0
            && s.display !== 'none'
            && s.visibility !== 'hidden'
            && s.opacity !== '0';
    };

    // Serialize a clone so the live DOM is untouched; make sure the markup
    // carries an xmlns so it is valid as a standalone .svg file.
    const serializeSvg = (svg) => {
        const clone = svg.cloneNode(true);
        if (!clone.getAttribute('xmlns')) {
            clone.setAttribute('xmlns', SVG_NS);
        }
        return clone.outerHTML;
    };

    const imgSource = (img) => img.src || img.getAttribute('src');

    for (const sel of selectors) {
        const el = document.querySelector(sel);
        if (!el || !isVisible(el)) continue;

        const tag = el.tagName.toLowerCase();
        if (tag === 'svg') {
            return { type: 'svg', data: serializeSvg(el), selector: sel };
        }
        if (tag === 'img') {
            return { type: 'img', data: imgSource(el), selector: sel };
        }

        // The selector matched a container: look for an svg/img inside it.
        const svg = el.querySelector('svg');
        if (svg && isVisible(svg)) {
            return { type: 'svg', data: serializeSvg(svg), selector: sel + ' > svg' };
        }
        const img = el.querySelector('img');
        if (img && isVisible(img)) {
            return { type: 'img', data: imgSource(img), selector: sel + ' > img' };
        }
    }

    // Fallback: first visible svg whose width and height are both 24-200px.
    for (const svg of document.querySelectorAll('svg')) {
        if (!isVisible(svg)) continue;
        const r = svg.getBoundingClientRect();
        if (r.width >= 24 && r.width <= 200 && r.height >= 24 && r.height <= 200) {
            return { type: 'svg', data: serializeSvg(svg), selector: 'svg[fallback]' };
        }
    }

    return { type: 'none', data: null, selector: null };
}
"""


def _extract_icon_from_page(page) -> Dict:
    """Extract the model icon from an already rendered page.

    Args:
        page: A Playwright sync ``Page`` that has finished navigating.

    Returns:
        ``{"type": "svg" | "img" | "none", "data": str | None,
        "selector": str | None}`` where ``data`` is the SVG markup or the
        image ``src`` and ``selector`` is the selector that matched.
    """
    result = page.evaluate(_EXTRACT_ICON_JS, _ICON_SELECTORS)
    return result or {"type": "none", "data": None, "selector": None}


def _is_remote_svg_url(src: Optional[str]) -> bool:
    """Return True if *src* is an http(s) URL whose path ends in ``.svg``.

    Only the URL path is inspected, so query strings and fragments
    (``icon.svg?v=3``) do not hide the extension. Non-http(s) schemes are
    rejected because the URL comes from page content and is later fetched
    with ``urllib``, which would otherwise also open ``file://`` URLs.
    """
    if not src:
        return False
    try:
        parsed = urllib.parse.urlparse(src)
    except ValueError:
        return False
    return parsed.scheme in ("http", "https") and parsed.path.lower().endswith(".svg")


def _navigate(page, url: str, timeout: int) -> Optional[str]:
    """Open *url*; return an error message on failure, else ``None``.

    First waits for ``networkidle``. If that times out, the navigation is
    retried waiting only for the ``load`` event.
    """
    try:
        page.goto(url, wait_until="networkidle", timeout=timeout)
    except PlaywrightTimeoutError:
        try:
            page.goto(url, wait_until="load", timeout=timeout)
        except Exception as e:
            return f"导航失败: {e}"
    return None


def _wait_for_content(page) -> None:
    """Wait until one of the known content markers shows up (best effort)."""
    for sel in _CONTENT_READY_SELECTORS:
        try:
            page.wait_for_selector(sel, timeout=6000)
            break
        except PlaywrightTimeoutError:
            pass
    # Short settle delay. wait_for_timeout keeps Playwright's event loop
    # running, unlike time.sleep which blocks it in the sync API.
    page.wait_for_timeout(1000)


def _save_screenshot(page, path: str, result: Dict) -> None:
    """Screenshot the icon element (or the viewport) to *path*.

    Records ``saved_screenshot`` on success or ``screenshot_error`` on
    failure in *result*.
    """
    os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
    try:
        # Prefer a screenshot of the icon element itself.
        el = page.locator('[class*="modelIcon"]').first
        if el.count() > 0:
            el.screenshot(path=path)
        else:
            page.screenshot(path=path, full_page=False)
        result["saved_screenshot"] = path
    except Exception as e:
        result["screenshot_error"] = str(e)


def scrape_model_icon(
    url: str,
    headless: bool = True,
    timeout: int = 20000,
    executable_path: Optional[str] = None,
    save_svg: Optional[str] = None,
    screenshot: Optional[str] = None,
) -> Dict:
    """Scrape the model icon from a model detail page.

    Args:
        url: Model detail page URL.
        headless: Launch Chromium headless when True.
        timeout: Navigation timeout in milliseconds, passed to ``page.goto``.
        executable_path: Optional path of the browser executable.
        save_svg: If given and an SVG was obtained, write it to this path.
        screenshot: If given, save a PNG screenshot of the icon to this path.

    Returns:
        A dict with at least::

            {
                "url": str,
                "icon_type": "svg" | "img" | "none",
                "icon_data": str | None,   # SVG markup or img src URL
                "selector": str | None,    # selector that matched
                "error": str | None,
            }

        Optional keys: ``icon_url`` (original URL when an SVG ``<img>`` was
        downloaded and inlined), ``fetch_error``, ``saved_svg``,
        ``saved_screenshot``, ``screenshot_error``.
    """
    result = {"url": url, "icon_type": "none", "icon_data": None, "selector": None, "error": None}

    with sync_playwright() as p:
        launch_kwargs = {"headless": headless}
        if executable_path:
            launch_kwargs["executable_path"] = executable_path

        browser = p.chromium.launch(**launch_kwargs)
        # try/finally guarantees the browser is closed on every exit path,
        # including unexpected exceptions from the steps below.
        try:
            page = browser.new_context().new_page()

            nav_error = _navigate(page, url, timeout)
            if nav_error is not None:
                result["error"] = nav_error
                return result

            # Wait for the main page content to appear.
            _wait_for_content(page)

            icon = _extract_icon_from_page(page)
            result["icon_type"] = icon["type"]
            result["icon_data"] = icon["data"]
            result["selector"] = icon["selector"]

            # If the icon is an <img> pointing at an SVG file, download the
            # file and report its content as SVG data instead.
            if icon["type"] == "img" and _is_remote_svg_url(icon["data"]):
                try:
                    with urllib.request.urlopen(icon["data"], timeout=10) as resp:
                        svg_content = resp.read().decode("utf-8")
                except Exception as e:
                    # Best effort: keep the img result and record why the
                    # download failed.
                    result["fetch_error"] = str(e)
                else:
                    result["icon_type"] = "svg"
                    result["icon_data"] = svg_content
                    result["icon_url"] = icon["data"]
                    icon = {**icon, "type": "svg", "data": svg_content}

            # Save the SVG file.
            if save_svg and icon["type"] == "svg" and icon["data"]:
                os.makedirs(os.path.dirname(save_svg) or ".", exist_ok=True)
                with open(save_svg, "w", encoding="utf-8") as f:
                    f.write(icon["data"])
                result["saved_svg"] = save_svg

            # Save the screenshot.
            if screenshot:
                _save_screenshot(page, screenshot, result)
        finally:
            browser.close()

    return result


def main():
    """Command-line entry point: parse arguments, scrape, print JSON."""
    ap = argparse.ArgumentParser(description="爬取阿里云百炼模型图标(SVG/img)")
    ap.add_argument("--url", required=True, help="模型详情页 URL")
    ap.add_argument("--headful", action="store_true", help="有头模式(方便调试)")
    ap.add_argument("--timeout", type=int, default=20000)
    ap.add_argument("--browser-path", help="浏览器可执行文件路径")
    ap.add_argument("--save-svg", help="将 SVG 保存到指定路径,如 icons/qwen3-max.svg")
    ap.add_argument("--screenshot", help="将图标截图保存为 PNG,如 icons/qwen3-max.png")
    args = ap.parse_args()

    exec_path = args.browser_path or os.environ.get("PLAYWRIGHT_EXECUTABLE")
    headless = not args.headful
    # PLAYWRIGHT_HEADLESS=false forces headful mode even without --headful.
    if os.environ.get("PLAYWRIGHT_HEADLESS", "").lower() == "false":
        headless = False

    result = scrape_model_icon(
        url=args.url,
        headless=headless,
        timeout=args.timeout,
        executable_path=exec_path,
        save_svg=args.save_svg,
        screenshot=args.screenshot,
    )
    print(json.dumps(result, ensure_ascii=False, indent=2))


if __name__ == "__main__":
    main()