|
|
@@ -0,0 +1,243 @@
|
|
|
+#!/usr/bin/env python3
|
|
|
+"""
|
|
|
+Aliyun Model Icon Scraper
|
|
|
+用 Playwright 渲染模型详情页,从 DOM 中提取模型图标(SVG 或 img)。
|
|
|
+
|
|
|
+用法:
|
|
|
+ python scrape_model_icon.py --url "https://bailian.console.aliyun.com/.../qwen3-max"
|
|
|
+ python scrape_model_icon.py --url "..." --save-svg icons/qwen3-max.svg
|
|
|
+ python scrape_model_icon.py --url "..." --screenshot icons/qwen3-max.png
|
|
|
+"""
|
|
|
+
|
|
|
+import argparse
|
|
|
+import json
|
|
|
+import os
|
|
|
+import re
|
|
|
+import time
|
|
|
+from typing import Optional, Dict
|
|
|
+
|
|
|
+from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError
|
|
|
+
|
|
|
+
|
|
|
# CSS selectors for the model icon, listed in the order they should be tried.
# The large card icon sits in the top-left corner of the model detail page.
_ICON_SELECTORS = [
    # Icon container in the detail-page header: for each class-name spelling,
    # prefer an inline <svg> over an <img>.
    *(
        f'[class*="{marker}"] {tag}'
        for marker in ("modelIcon", "model-icon")
        for tag in ("svg", "img")
    ),
    # Small icon in the breadcrumb (backup).
    '[class*="currentModelIcon"]',
    # Generic: first svg inside the page header area.
    ".pageHeader svg",
    '[class*="pageHeader"] svg',
    # The last-resort fallback (first reasonably sized svg anywhere on the
    # page) is not a selector; the extraction script handles it separately.
]
|
|
|
+
|
|
|
+
|
|
|
def _extract_icon_from_page(page, selectors: Optional[list] = None) -> Dict:
    """Extract the model icon from an already-rendered Playwright page.

    The lookup runs inside the browser via ``page.evaluate``. Selectors are
    tried in order; for each selector every matching element is inspected and
    the first *visible* one that yields an icon wins. A match may be the
    ``<svg>``/``<img>`` itself or a container holding one. If no selector
    produces an icon, the first visible ``<svg>`` whose rendered width and
    height are both between 24 and 200 px is used as a last resort.

    Args:
        page: Object exposing ``evaluate(expression, arg)`` (a Playwright
            ``Page`` in practice).
        selectors: CSS selectors in priority order. Defaults to the
            module-level ``_ICON_SELECTORS`` when ``None``.

    Returns:
        ``{"type": "svg" | "img" | "none", "data": str | None,
        "selector": str | None}``. For ``svg`` the data is the element's
        ``outerHTML`` (with ``xmlns`` added if missing); for ``img`` it is the
        image source URL. ``selector`` is the selector that matched, suffixed
        with ``" > svg"`` / ``" > img"`` when the icon was found inside a
        container, or ``"svg[fallback]"`` for the size-based fallback.
    """
    # The selector list is handed to the script as an argument so that a single
    # Python-side list is the source of truth instead of a copy inside the JS.
    chosen = list(_ICON_SELECTORS if selectors is None else selectors)

    script = """
    (selectors) => {
        const isVisible = (el) => {
            if (!el) return false;
            const r = el.getBoundingClientRect();
            const s = window.getComputedStyle(el);
            return r.width > 0 && r.height > 0
                && s.display !== 'none'
                && s.visibility !== 'hidden'
                && s.opacity !== '0';
        };

        // Serialize a clone (the live DOM is left untouched) and make sure
        // the markup carries an xmlns attribute.
        const serializeSvg = (svg) => {
            const clone = svg.cloneNode(true);
            if (!clone.getAttribute('xmlns')) {
                clone.setAttribute('xmlns', 'http://www.w3.org/2000/svg');
            }
            return clone.outerHTML;
        };

        const imgSource = (img) => img.src || img.getAttribute('src');

        // Turn a visible match into a result, or null if it holds no icon.
        const describe = (el, sel) => {
            const tag = el.tagName.toLowerCase();
            if (tag === 'svg') {
                return { type: 'svg', data: serializeSvg(el), selector: sel };
            }
            if (tag === 'img') {
                return { type: 'img', data: imgSource(el), selector: sel };
            }

            // The match is a container: look for an svg first, then an img.
            const svg = el.querySelector('svg');
            if (svg && isVisible(svg)) {
                return { type: 'svg', data: serializeSvg(svg), selector: sel + ' > svg' };
            }
            const img = el.querySelector('img');
            if (img && isVisible(img)) {
                return { type: 'img', data: imgSource(img), selector: sel + ' > img' };
            }
            return null;
        };

        for (const sel of selectors) {
            // Inspect every match, not just the first one, so a hidden
            // duplicate earlier in the DOM cannot mask a visible icon.
            for (const el of document.querySelectorAll(sel)) {
                if (!isVisible(el)) continue;
                const hit = describe(el, sel);
                if (hit) return hit;
            }
        }

        // Fallback: first visible svg sized between 24 and 200 px on both axes.
        for (const svg of document.querySelectorAll('svg')) {
            if (!isVisible(svg)) continue;
            const r = svg.getBoundingClientRect();
            if (r.width >= 24 && r.width <= 200 && r.height >= 24 && r.height <= 200) {
                return { type: 'svg', data: serializeSvg(svg), selector: 'svg[fallback]' };
            }
        }

        return { type: 'none', data: null, selector: null };
    }
    """

    result = page.evaluate(script, chosen)
    # Guard against a falsy return so callers can always index the keys.
    return result or {"type": "none", "data": None, "selector": None}
|
|
|
+
|
|
|
+
|
|
|
def scrape_model_icon(
    url: str,
    headless: bool = True,
    timeout: int = 20000,
    executable_path: Optional[str] = None,
    save_svg: Optional[str] = None,
    screenshot: Optional[str] = None,
) -> Dict:
    """Render a model detail page in Chromium and extract its icon.

    Args:
        url: Model detail page URL.
        headless: Launch the browser without a window when True.
        timeout: Navigation timeout in milliseconds, applied to each
            ``page.goto`` attempt.
        executable_path: Optional browser executable to launch instead of the
            Playwright-managed one.
        save_svg: If given and the icon ends up as SVG markup, write it to
            this path (parent directories are created).
        screenshot: If given, save a PNG of the first ``[class*="modelIcon"]``
            element, or of the viewport when no such element exists.

    Returns:
        A dict that always contains::

            {
                "url": str,
                "icon_type": "svg" | "img" | "none",
                "icon_data": str | None,   # SVG markup or img src URL
                "selector": str | None,    # selector that matched
                "error": str | None,       # set when navigation failed
            }

        and may additionally contain ``icon_url`` (original img URL when an
        SVG image was downloaded), ``fetch_error``, ``saved_svg``,
        ``saved_screenshot`` and ``screenshot_error``.
    """
    result = {"url": url, "icon_type": "none", "icon_data": None, "selector": None, "error": None}

    with sync_playwright() as p:
        launch_kwargs = {"headless": headless}
        if executable_path:
            launch_kwargs["executable_path"] = executable_path

        browser = p.chromium.launch(**launch_kwargs)
        # try/finally guarantees the browser is closed on every exit path,
        # including unexpected exceptions from extraction or file I/O.
        try:
            page = browser.new_context().new_page()

            try:
                page.goto(url, wait_until="networkidle", timeout=timeout)
            except PlaywrightTimeoutError:
                # The network may never go idle; retry with the weaker
                # "load" condition before giving up.
                try:
                    page.goto(url, wait_until="load", timeout=timeout)
                except Exception as e:
                    result["error"] = f"导航失败: {e}"
                    return result
            except Exception as e:
                # Non-timeout navigation failures are reported through the
                # "error" field, like the retry path above.
                result["error"] = f"导航失败: {e}"
                return result

            # Wait for the main content: the first marker that appears wins.
            for sel in ["text=模型介绍", "text=模型价格", '[class*="modelIcon"]', '[class*="pageHeader"]']:
                try:
                    page.wait_for_selector(sel, timeout=6000)
                    break
                except PlaywrightTimeoutError:
                    pass
            # Short settle delay. wait_for_timeout is used instead of
            # time.sleep so Playwright keeps processing events meanwhile.
            page.wait_for_timeout(1000)

            icon = _extract_icon_from_page(page)
            result["icon_type"] = icon["type"]
            result["icon_data"] = icon["data"]
            result["selector"] = icon["selector"]

            # An <img> pointing at an SVG file: download the markup so the
            # result carries SVG content rather than just a URL.
            if icon["type"] == "img" and _url_path_is_svg(icon["data"]):
                try:
                    svg_content = _download_text(icon["data"])
                except Exception as e:
                    # Best effort: keep the img result and record the reason.
                    result["fetch_error"] = str(e)
                else:
                    result["icon_type"] = "svg"
                    result["icon_data"] = svg_content
                    result["icon_url"] = icon["data"]
                    icon = {**icon, "type": "svg", "data": svg_content}

            # Save the SVG markup to disk.
            if save_svg and icon["type"] == "svg" and icon["data"]:
                os.makedirs(os.path.dirname(save_svg) or ".", exist_ok=True)
                with open(save_svg, "w", encoding="utf-8") as f:
                    f.write(icon["data"])
                result["saved_svg"] = save_svg

            # Save a screenshot.
            if screenshot:
                os.makedirs(os.path.dirname(screenshot) or ".", exist_ok=True)
                try:
                    # Prefer a screenshot of the icon element itself.
                    el = page.locator('[class*="modelIcon"]').first
                    if el.count() > 0:
                        el.screenshot(path=screenshot)
                    else:
                        page.screenshot(path=screenshot, full_page=False)
                    result["saved_screenshot"] = screenshot
                except Exception as e:
                    result["screenshot_error"] = str(e)
        finally:
            browser.close()

    return result


def _url_path_is_svg(src: Optional[str]) -> bool:
    """Return True when the path component of *src* ends in ``.svg``."""
    if not src:
        return False
    import urllib.parse

    # Look at the path only, so "icon.svg?v=3" or "icon.svg#x" still match.
    return urllib.parse.urlparse(src).path.lower().endswith(".svg")


def _download_text(src: str, timeout: float = 10) -> str:
    """Fetch *src* and return the response body decoded as UTF-8."""
    import urllib.request

    with urllib.request.urlopen(src, timeout=timeout) as resp:
        return resp.read().decode("utf-8")
|
|
|
+
|
|
|
+
|
|
|
def main():
    """Command-line entry point: parse options, scrape, print the result as JSON.

    The browser executable comes from ``--browser-path`` or, failing that, the
    ``PLAYWRIGHT_EXECUTABLE`` environment variable. The browser runs headless
    unless ``--headful`` is passed or ``PLAYWRIGHT_HEADLESS`` is set to
    ``false`` (case-insensitive).
    """
    # Declared as data and registered in this exact order.
    option_table = (
        ("--url", {"required": True, "help": "模型详情页 URL"}),
        ("--headful", {"action": "store_true", "help": "有头模式(方便调试)"}),
        ("--timeout", {"type": int, "default": 20000}),
        ("--browser-path", {"help": "浏览器可执行文件路径"}),
        ("--save-svg", {"help": "将 SVG 保存到指定路径,如 icons/qwen3-max.svg"}),
        ("--screenshot", {"help": "将图标截图保存为 PNG,如 icons/qwen3-max.png"}),
    )
    parser = argparse.ArgumentParser(description="爬取阿里云百炼模型图标(SVG/img)")
    for flag, settings in option_table:
        parser.add_argument(flag, **settings)
    opts = parser.parse_args()

    browser_binary = opts.browser_path or os.environ.get("PLAYWRIGHT_EXECUTABLE")
    env_disables_headless = os.environ.get("PLAYWRIGHT_HEADLESS", "").lower() == "false"
    # Either the flag or the environment variable is enough to show a window.
    run_headless = not (opts.headful or env_disables_headless)

    outcome = scrape_model_icon(
        url=opts.url,
        headless=run_headless,
        timeout=opts.timeout,
        executable_path=browser_binary,
        save_svg=opts.save_svg,
        screenshot=opts.screenshot,
    )

    # ensure_ascii=False keeps non-ASCII text readable in the output.
    print(json.dumps(outcome, ensure_ascii=False, indent=2))


# Run the CLI only when the file is executed as a script.
if __name__ == "__main__":
    main()
|