| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243 |
- #!/usr/bin/env python3
- """
- Aliyun Model Icon Scraper
- 用 Playwright 渲染模型详情页,从 DOM 中提取模型图标(SVG 或 img)。
- 用法:
- python scrape_model_icon.py --url "https://bailian.console.aliyun.com/.../qwen3-max"
- python scrape_model_icon.py --url "..." --save-svg icons/qwen3-max.svg
- python scrape_model_icon.py --url "..." --screenshot icons/qwen3-max.png
- """
- import argparse
- import json
- import os
- import re
- import time
- from typing import Optional, Dict
- from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError
# CSS selectors tried in priority order. This list is the single source of
# truth: it is handed to the in-page script as an argument instead of being
# duplicated inside the JavaScript source.
_ICON_SELECTORS = [
    # Icon container in the model detail page header (large card icon,
    # top-left of the page).
    '[class*="modelIcon"] svg',
    '[class*="modelIcon"] img',
    '[class*="model-icon"] svg',
    '[class*="model-icon"] img',
    # Small icon in the breadcrumb (backup).
    '[class*="currentModelIcon"]',
    # Generic: first svg inside the page header area.
    '.pageHeader svg',
    '[class*="pageHeader"] svg',
    # The last-resort fallback (first reasonably sized svg anywhere on the
    # page) is not a selector; it is implemented in _EXTRACT_ICON_JS.
]

# In-page script. Receives the selector list as its only argument and returns
# a plain object {type, data, selector}.
_EXTRACT_ICON_JS = """
(selectors) => {
    const isVisible = (el) => {
        if (!el) return false;
        const r = el.getBoundingClientRect();
        const s = window.getComputedStyle(el);
        return r.width > 0 && r.height > 0
            && s.display !== 'none'
            && s.visibility !== 'hidden'
            && s.opacity !== '0';
    };
    // Serialize a clone so the live DOM is never mutated; add xmlns when it
    // is missing so the markup is usable as a standalone SVG document.
    const serializeSvg = (svg) => {
        const clone = svg.cloneNode(true);
        if (!clone.getAttribute('xmlns')) {
            clone.setAttribute('xmlns', 'http://www.w3.org/2000/svg');
        }
        return clone.outerHTML;
    };
    const imgSrc = (img) => img.src || img.getAttribute('src');

    for (const sel of selectors) {
        const el = document.querySelector(sel);
        if (!el || !isVisible(el)) continue;
        const tag = el.tagName.toLowerCase();
        if (tag === 'svg') {
            return { type: 'svg', data: serializeSvg(el), selector: sel };
        }
        if (tag === 'img') {
            return { type: 'img', data: imgSrc(el), selector: sel };
        }
        // The selector matched a container: look for an svg/img inside it.
        const svg = el.querySelector('svg');
        if (svg && isVisible(svg)) {
            return { type: 'svg', data: serializeSvg(svg), selector: sel + ' > svg' };
        }
        const img = el.querySelector('img');
        if (img && isVisible(img)) {
            return { type: 'img', data: imgSrc(img), selector: sel + ' > img' };
        }
    }

    // Fallback: first visible svg on the page whose rendered width and
    // height are both between 24 and 200 px.
    for (const svg of Array.from(document.querySelectorAll('svg'))) {
        if (!isVisible(svg)) continue;
        const r = svg.getBoundingClientRect();
        if (r.width >= 24 && r.width <= 200 && r.height >= 24 && r.height <= 200) {
            return { type: 'svg', data: serializeSvg(svg), selector: 'svg[fallback]' };
        }
    }
    return { type: 'none', data: null, selector: null };
}
"""


def _extract_icon_from_page(page) -> Dict:
    """Extract the model icon from an already rendered page.

    Runs ``_EXTRACT_ICON_JS`` in the page via ``page.evaluate``, passing
    ``_ICON_SELECTORS`` as the script argument. The script tries each
    selector in order and then falls back to the first visible,
    reasonably sized ``<svg>``.

    Args:
        page: Object exposing ``evaluate(expression, arg)``.

    Returns:
        ``{"type": "svg" | "img" | "none", "data": ..., "selector": ...}``.
        For ``"svg"`` the data is the element's outerHTML, for ``"img"`` it
        is the image ``src``. When the script yields a falsy value the
        ``"none"`` record (data and selector set to ``None``) is returned.
    """
    found = page.evaluate(_EXTRACT_ICON_JS, _ICON_SELECTORS)
    return found or {"type": "none", "data": None, "selector": None}
def _is_remote_svg_url(src: Optional[str]) -> bool:
    """Return True when *src* is an http(s) URL whose path ends in ``.svg``."""
    if not src:
        return False
    import urllib.parse

    parsed = urllib.parse.urlparse(src)
    # Check the path only, so "logo.svg?v=1" or "logo.SVG#x" still match.
    # The URL comes from page content, so anything that is not http(s)
    # (e.g. file:) is refused rather than handed to urlopen.
    return parsed.scheme in ("http", "https") and parsed.path.lower().endswith(".svg")


def _navigate(page, url: str, timeout: int) -> Optional[str]:
    """Load *url* in *page*; return an error message, or None on success."""
    try:
        page.goto(url, wait_until="networkidle", timeout=timeout)
        return None
    except PlaywrightTimeoutError:
        # The network never went idle within the timeout: retry below with
        # the weaker "load" condition.
        pass
    except Exception as e:
        return f"导航失败: {e}"
    try:
        page.goto(url, wait_until="load", timeout=timeout)
    except Exception as e:
        return f"导航失败: {e}"
    return None


def scrape_model_icon(
    url: str,
    headless: bool = True,
    timeout: int = 20000,
    executable_path: Optional[str] = None,
    save_svg: Optional[str] = None,
    screenshot: Optional[str] = None,
) -> Dict:
    """Scrape the model icon from a model detail page.

    Args:
        url: Model detail page URL.
        headless: Launch Chromium in headless mode.
        timeout: Timeout passed to each ``page.goto`` call.
        executable_path: Optional browser executable to launch.
        save_svg: When given and an SVG was obtained, write it to this path.
        screenshot: When given, save a screenshot of the icon element (or of
            the viewport when no icon element is found) to this path.

    Returns:
        A dict with the keys::

            {
                "url": str,
                "icon_type": "svg" | "img" | "none",
                "icon_data": str,   # SVG markup or img src URL
                "selector": str,    # selector that matched
                "error": str | None # set when navigation failed
            }

        plus, when applicable: ``icon_url`` (source URL of an inlined SVG
        image), ``fetch_error``, ``saved_svg``, ``saved_screenshot`` and
        ``screenshot_error``.
    """
    import urllib.request

    result = {"url": url, "icon_type": "none", "icon_data": None, "selector": None, "error": None}
    with sync_playwright() as p:
        launch_kwargs = {"headless": headless}
        if executable_path:
            launch_kwargs["executable_path"] = executable_path
        browser = p.chromium.launch(**launch_kwargs)
        # try/finally guarantees the browser is closed on every exit path,
        # including exceptions raised while extracting or saving.
        try:
            page = browser.new_context().new_page()

            nav_error = _navigate(page, url, timeout)
            if nav_error is not None:
                result["error"] = nav_error
                return result

            # Wait for the main page content: stop at the first marker that
            # shows up, and carry on regardless if none does.
            for sel in ["text=模型介绍", "text=模型价格", '[class*="modelIcon"]', '[class*="pageHeader"]']:
                try:
                    page.wait_for_selector(sel, timeout=6000)
                    break
                except PlaywrightTimeoutError:
                    pass
            # Short settle delay. wait_for_timeout is used instead of
            # time.sleep so the Playwright sync API keeps processing events.
            page.wait_for_timeout(1000)

            icon = _extract_icon_from_page(page)
            result["icon_type"] = icon["type"]
            result["icon_data"] = icon["data"]
            result["selector"] = icon["selector"]

            # An <img> pointing at an SVG file: download the markup so the
            # caller receives SVG content rather than just a URL.
            if icon["type"] == "img" and _is_remote_svg_url(icon["data"]):
                try:
                    with urllib.request.urlopen(icon["data"], timeout=10) as resp:
                        svg_content = resp.read().decode("utf-8")
                except Exception as e:
                    # Best effort: keep the img result and record the reason.
                    result["fetch_error"] = str(e)
                else:
                    result["icon_type"] = "svg"
                    result["icon_data"] = svg_content
                    result["icon_url"] = icon["data"]

            # Save the SVG file.
            if save_svg and result["icon_type"] == "svg" and result["icon_data"]:
                os.makedirs(os.path.dirname(save_svg) or ".", exist_ok=True)
                with open(save_svg, "w", encoding="utf-8") as f:
                    f.write(result["icon_data"])
                result["saved_svg"] = save_svg

            # Save a screenshot.
            if screenshot:
                os.makedirs(os.path.dirname(screenshot) or ".", exist_ok=True)
                try:
                    # Prefer a screenshot of the icon element itself.
                    el = page.locator('[class*="modelIcon"]').first
                    if el.count() > 0:
                        el.screenshot(path=screenshot)
                    else:
                        page.screenshot(path=screenshot, full_page=False)
                    result["saved_screenshot"] = screenshot
                except Exception as e:
                    result["screenshot_error"] = str(e)
        finally:
            browser.close()
    return result
def main():
    """Command-line entry point: parse options, scrape, print the result as JSON.

    Environment overrides: ``PLAYWRIGHT_EXECUTABLE`` supplies the browser
    path when ``--browser-path`` is not given, and ``PLAYWRIGHT_HEADLESS=false``
    forces headful mode.
    """
    parser = argparse.ArgumentParser(description="爬取阿里云百炼模型图标(SVG/img)")
    parser.add_argument("--url", required=True, help="模型详情页 URL")
    parser.add_argument("--headful", action="store_true", help="有头模式(方便调试)")
    parser.add_argument("--timeout", type=int, default=20000)
    parser.add_argument("--browser-path", help="浏览器可执行文件路径")
    parser.add_argument("--save-svg", help="将 SVG 保存到指定路径,如 icons/qwen3-max.svg")
    parser.add_argument("--screenshot", help="将图标截图保存为 PNG,如 icons/qwen3-max.png")
    opts = parser.parse_args()

    # Headful wins if either the flag or the environment variable asks for it.
    env_wants_headful = os.environ.get("PLAYWRIGHT_HEADLESS", "").lower() == "false"
    run_headless = not (opts.headful or env_wants_headful)

    browser_path = opts.browser_path or os.environ.get("PLAYWRIGHT_EXECUTABLE")

    outcome = scrape_model_icon(
        url=opts.url,
        headless=run_headless,
        timeout=opts.timeout,
        executable_path=browser_path,
        save_svg=opts.save_svg,
        screenshot=opts.screenshot,
    )
    print(json.dumps(outcome, ensure_ascii=False, indent=2))


if __name__ == "__main__":
    main()
|