scrape_rate_limits.py 6.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182
  1. #!/usr/bin/env python3
  2. """
  3. scrape_rate_limits.py
  4. 抓取阿里云百炼模型"模型限流与上下文"区块,字段与页面完全对应:
  5. 最大输入长度、RPM、最大输入长度(思考)、上下文长度
  6. 最大输出长度、TPM、最大输出长度(思考)、最大思维链长度
  7. 原理:从页面文本直接提取,字段名和值与页面显示一致。
  8. """
  9. import re
  10. import time
  11. import json
  12. from typing import Dict, List, Optional
  13. from playwright.sync_api import TimeoutError as PlaywrightTimeoutError
  14. # 页面字段名 -> 输出 key 映射(按截图顺序)
  15. FIELD_PATTERNS = [
  16. # (正则匹配页面文字, 输出 key)
  17. (r"最大输入长度[((]思考[))]", "最大输入长度(思考)"),
  18. (r"最大输入长度", "最大输入长度"),
  19. (r"最大输出长度[((]思考[))]", "最大输出长度(思考)"),
  20. (r"最大输出长度", "最大输出长度"),
  21. (r"上下文长度", "上下文长度"),
  22. (r"最大思维链长度", "最大思维链长度"),
  23. (r"\bRPM\b", "RPM"),
  24. (r"\bTPM\b", "TPM"),
  25. (r"\bQPM\b", "QPM"),
  26. ]
# Value format: digits (ASCII or fullwidth "," separators allowed), optional
# decimal part, optional unit suffix (K/M in either case, or 万 = 10,000).
# NOTE(review): compiled here but not referenced anywhere in this file —
# presumably kept for importers; confirm before removing.
VALUE_RE = re.compile(r"(\d[\d,,]*(?:\.\d+)?\s*[KkMm万]?)")
  29. def _extract_model_id_from_url(url: str) -> str:
  30. m = re.search(r"#.*?/detail/([^/?#&]+)", url)
  31. if m:
  32. return m.group(1).strip()
  33. clean = re.sub(r"[?#].*", "", url)
  34. parts = [p for p in clean.rstrip("/").split("/") if p]
  35. return parts[-1] if parts else ""
def _get_rate_limit_section_text(page) -> str:
    """Extract the text of the "模型限流与上下文" (rate limit & context) section.

    Walks every text node in the rendered page looking for the section
    heading, then climbs up to 10 ancestor levels until it finds a container
    whose innerText is long enough (>50 chars) and mentions RPM/TPM or a
    "<digits>K" value; that container's text is returned.  Returns "" when
    the section is not found or evaluation fails (best effort — the caller
    treats "" as "section missing").
    """
    try:
        # NOTE: the JS below runs inside the browser page; its Chinese
        # comments are part of the evaluated string and are left untouched.
        return page.evaluate("""
        () => {
            // 找"模型限流与上下文"标题节点
            const walker = document.createTreeWalker(document.body, NodeFilter.SHOW_TEXT);
            let node;
            while ((node = walker.nextNode())) {
                if (/模型限流|限流与上下文/.test(node.textContent)) {
                    let el = node.parentElement;
                    for (let i = 0; i < 10; i++) {
                        if (!el) break;
                        const txt = (el.innerText || '').trim();
                        // 找到包含数字和限流关键词的容器
                        if (txt.length > 50 && /RPM|TPM|\\d+K/.test(txt)) return txt;
                        el = el.parentElement;
                    }
                }
            }
            return '';
        }
        """)
    except Exception:
        # Deliberate best-effort swallow: any evaluation error (navigation
        # raced, page closed, ...) degrades to "section not found".
        return ""
  61. def parse_rate_limits_from_text(text: str) -> Dict:
  62. """
  63. 从限流区块文本中提取字段,输出与页面完全对应。
  64. 文本示例(紧凑格式):
  65. 模型限流与上下文最大输入长度252KRPM30000最大输入长度(思考)252K上下文长度256K
  66. 最大输出长度64KTPM5000000最大输出长度(思考)32K最大思维链长度80K
  67. """
  68. result: Dict = {}
  69. # 把文本规范化:去掉多余空白
  70. text = re.sub(r"\s+", " ", text).strip()
  71. for pattern, key in FIELD_PATTERNS:
  72. if key in result:
  73. continue
  74. # 找字段名,然后取紧跟其后的数值
  75. m = re.search(pattern + r"\s*([0-9][0-9,,]*(?:\.\d+)?\s*[KkMm万]?)", text, re.I)
  76. if m:
  77. val = m.group(1).strip().replace(",", ",")
  78. # 统一大写 K
  79. val = re.sub(r"k$", "K", val)
  80. result[key] = val
  81. return result
  82. def scrape_rate_limits_standalone(
  83. url: str,
  84. headless: bool = True,
  85. timeout: int = 20000,
  86. executable_path: Optional[str] = None,
  87. ) -> Dict:
  88. """独立运行:启动浏览器,导航,抓取限流信息后关闭。"""
  89. from playwright.sync_api import sync_playwright
  90. target = _extract_model_id_from_url(url)
  91. result: Dict = {"url": url, "model_code": target, "error": None}
  92. with sync_playwright() as p:
  93. launch_kwargs: Dict = {"headless": headless}
  94. if executable_path:
  95. launch_kwargs["executable_path"] = executable_path
  96. browser = p.chromium.launch(**launch_kwargs)
  97. page = browser.new_context().new_page()
  98. try:
  99. page.goto(url, wait_until="networkidle", timeout=timeout)
  100. except PlaywrightTimeoutError:
  101. try:
  102. page.goto(url, wait_until="load", timeout=timeout)
  103. except Exception as e:
  104. result["error"] = f"导航失败: {e}"
  105. browser.close()
  106. return result
  107. for sel in ["text=模型限流", "text=上下文长度", "text=RPM"]:
  108. try:
  109. page.wait_for_selector(sel, timeout=6000)
  110. break
  111. except PlaywrightTimeoutError:
  112. pass
  113. time.sleep(1.0)
  114. # 滚动确保限流区块加载
  115. try:
  116. page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
  117. time.sleep(0.8)
  118. except Exception:
  119. pass
  120. text = _get_rate_limit_section_text(page)
  121. print(f"[DEBUG] 限流区块文本: {text[:200]}")
  122. if text:
  123. result["rate_limits"] = parse_rate_limits_from_text(text)
  124. else:
  125. result["error"] = "未找到模型限流与上下文区块"
  126. result["rate_limits"] = {}
  127. browser.close()
  128. return result
  129. if __name__ == "__main__":
  130. import argparse, os
  131. ap = argparse.ArgumentParser(description="抓取阿里云模型限流与上下文信息")
  132. group = ap.add_mutually_exclusive_group(required=True)
  133. group.add_argument("--url")
  134. group.add_argument("--file")
  135. ap.add_argument("--headful", action="store_true")
  136. ap.add_argument("--timeout", type=int, default=20000)
  137. ap.add_argument("--browser-path")
  138. args = ap.parse_args()
  139. urls = [args.url] if args.url else open(args.file, encoding="utf-8").read().splitlines()
  140. urls = [u.strip() for u in urls if u.strip()]
  141. exec_path = args.browser_path or os.environ.get("PLAYWRIGHT_EXECUTABLE")
  142. headless = not args.headful
  143. results = []
  144. for u in urls:
  145. print(f"抓取限流信息: {u}", flush=True)
  146. results.append(scrape_rate_limits_standalone(
  147. u, headless=headless, timeout=args.timeout, executable_path=exec_path
  148. ))
  149. print(json.dumps(results, ensure_ascii=False, indent=2))