main.py 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279
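#!/usr/bin/env python3
"""
main.py - Entry point for scraping complete Alibaba Cloud Bailian model information.

Combines the modules below. For each URL the info / rate / tool / icon scrapers
share a single browser session and run one after another; the price module
reuses its original script and starts its own browser.

- scrape_aliyun_models.py → model prices (including tiered billing)
- scrape_model_info.py    → basic model information + capabilities
- scrape_rate_limits.py   → rate limits and context window
- scrape_tool_prices.py   → tool-call prices
- scrape_model_icon.py    → model icon

Usage:
    python main.py --url "https://bailian.console.aliyun.com/...#/model-market/detail/qwen3-max"
    python main.py --file urls.txt
    python main.py --url "..." --browser-path "D:\\playwright-browsers\\...\\chrome.exe"
    python main.py --url "..." --modules info,price,rate,tool   # run only the listed modules
    python main.py --url "..." --headful                        # headed mode for debugging

Output: JSON on stdout, also saved to output/<model_id>.json
"""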
  17. import argparse
  18. import json
  19. import os
  20. import re
  21. import time
  22. from typing import Dict, List, Optional
  23. from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError
  24. # 导入各模块的核心解析函数(不启动独立浏览器)
  25. from scrape_model_info import (
  26. _extract_model_id_from_url,
  27. _find_model_in_json,
  28. parse_model_info,
  29. API_URL_RE as INFO_API_RE,
  30. )
  31. from scrape_rate_limits import (
  32. parse_rate_limits_from_text,
  33. _get_rate_limit_section_text,
  34. )
  35. from scrape_tool_prices import (
  36. parse_tool_prices_from_text,
  37. _get_tool_price_section_text,
  38. )
  39. from scrape_aliyun_models import (
  40. scrape_model_price,
  41. )
  42. from scrape_model_icon import _extract_icon_from_page
  43. def _navigate(page, url: str, timeout: int) -> bool:
  44. """导航到 URL,返回是否成功。"""
  45. try:
  46. page.goto(url, wait_until="networkidle", timeout=timeout)
  47. return True
  48. except PlaywrightTimeoutError:
  49. try:
  50. page.goto(url, wait_until="load", timeout=timeout)
  51. return True
  52. except Exception as e:
  53. print(f"[ERROR] 导航失败: {e}")
  54. return False
  55. def _wait_for_content(page) -> None:
  56. """等待页面核心内容渲染完成。"""
  57. for sel in ["text=模型价格", "text=模型介绍", "text=模型能力"]:
  58. try:
  59. page.wait_for_selector(sel, timeout=6000)
  60. break
  61. except PlaywrightTimeoutError:
  62. pass
  63. time.sleep(1.5)
  64. # 滚动触发懒加载
  65. try:
  66. page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
  67. time.sleep(0.8)
  68. page.evaluate("window.scrollTo(0, 0)")
  69. time.sleep(0.3)
  70. except Exception:
  71. pass
  72. def scrape_all(
  73. url: str,
  74. headless: bool = True,
  75. timeout: int = 20000,
  76. executable_path: Optional[str] = None,
  77. modules: Optional[List[str]] = None,
  78. api_key: Optional[str] = None,
  79. model_hint: Optional[str] = None,
  80. ) -> Dict:
  81. """
  82. 对单个 URL 运行所有(或指定)模块,共享一个浏览器实例。
  83. modules 可选值: ["info", "rate", "tool", "price"]
  84. 默认全部运行。
  85. api_key: 可选的 API 密钥,将通过请求头传递给目标站点。
  86. model_hint: 可选的模型名称提示,优先用于 API JSON 匹配,而不是从 URL 提取。
  87. """
  88. if modules is None:
  89. modules = ["info", "rate", "tool", "price", "icon"]
  90. # 优先用外部传入的 model_hint,否则从 URL 提取
  91. target = model_hint.strip() if model_hint and model_hint.strip() else _extract_model_id_from_url(url)
  92. result: Dict = {"url": url, "model_id": target, "error": None}
  93. # price 模块复用原始脚本,独立启动浏览器(原脚本结构限制)
  94. # 其余模块共享一个浏览器实例
  95. shared_modules = [m for m in modules if m != "price"]
  96. # ── 共享浏览器:info / rate / tool ──────────────────────────────────────────
  97. if shared_modules:
  98. api_data: List[Dict] = []
  99. with sync_playwright() as p:
  100. launch_kwargs: Dict = {"headless": headless}
  101. if executable_path:
  102. launch_kwargs["executable_path"] = executable_path
  103. # 额外 Chrome 启动参数(生产环境 Linux 可通过 PLAYWRIGHT_EXTRA_ARGS 注入)
  104. extra_args_env = os.environ.get("PLAYWRIGHT_EXTRA_ARGS", "")
  105. extra_args = [a.strip() for a in extra_args_env.split(",") if a.strip()]
  106. if extra_args:
  107. launch_kwargs["args"] = extra_args
  108. browser = p.chromium.launch(**launch_kwargs)
  109. # 如果有 api_key,通过额外请求头传递
  110. context_kwargs: Dict = {}
  111. if api_key:
  112. context_kwargs["extra_http_headers"] = {"Authorization": f"Bearer {api_key}"}
  113. page = browser.new_context(**context_kwargs).new_page()
  114. # 拦截 API 响应
  115. def on_response(resp):
  116. try:
  117. if "application/json" not in resp.headers.get("content-type", ""):
  118. return
  119. if not INFO_API_RE.search(resp.url):
  120. return
  121. try:
  122. api_data.append(resp.json())
  123. except Exception:
  124. pass
  125. except Exception:
  126. pass
  127. page.on("response", on_response)
  128. if not _navigate(page, url, timeout):
  129. result["error"] = "导航失败"
  130. browser.close()
  131. else:
  132. _wait_for_content(page)
  133. # 从 API 找模型对象
  134. model_obj = None
  135. for body in api_data:
  136. found = _find_model_in_json(body, target)
  137. if found:
  138. model_obj = found
  139. print(f"[INFO] API 找到模型: {found.get('model', found.get('name', target))}")
  140. break
  141. if not model_obj:
  142. print(f"[WARN] 未从 API 找到模型 '{target}',部分字段将为空")
  143. # ── info 模块 ──
  144. if "info" in shared_modules:
  145. if model_obj:
  146. result["info"] = parse_model_info(model_obj)
  147. else:
  148. result["info"] = {"error": f"未找到模型 '{target}'"}
  149. # ── rate 模块 ──
  150. if "rate" in shared_modules:
  151. rate_text = _get_rate_limit_section_text(page)
  152. result["rate_limits"] = parse_rate_limits_from_text(rate_text) if rate_text else {}
  153. # ── tool 模块 ──
  154. if "tool" in shared_modules:
  155. html = page.content()
  156. tool_text = _get_tool_price_section_text(html)
  157. result["tool_call_prices"] = parse_tool_prices_from_text(tool_text) if tool_text else []
  158. # ── icon 模块 ──
  159. if "icon" in shared_modules:
  160. icon = _extract_icon_from_page(page)
  161. result["icon"] = icon.get("data") if icon.get("type") != "none" else None
  162. browser.close()
  163. # ── price 模块(原始脚本,独立浏览器) ──────────────────────────────────────
  164. if "price" in modules:
  165. print(f"[INFO] 运行价格模块...")
  166. price_result = scrape_model_price(
  167. url,
  168. headless=headless,
  169. timeout=timeout,
  170. executable_path=executable_path,
  171. api_key=api_key,
  172. )
  173. result["prices"] = price_result.get("prices", {})
  174. if price_result.get("error"):
  175. result["price_error"] = price_result["error"]
  176. return result
  177. def main():
  178. ap = argparse.ArgumentParser(
  179. description="阿里云百炼模型完整信息抓取(整合所有模块)",
  180. formatter_class=argparse.RawDescriptionHelpFormatter,
  181. epilog="""
  182. 模块说明:
  183. info - 模型基本信息、能力、模态
  184. rate - 限流与上下文(RPM、context window 等)
  185. tool - 工具调用价格
  186. price - 模型 token 价格(含阶梯计费)
  187. 示例:
  188. python main.py --url "https://..." --browser-path "D:\\chrome.exe"
  189. python main.py --file urls.txt --headful
  190. python main.py --url "https://..." --modules info,rate
  191. """,
  192. )
  193. group = ap.add_mutually_exclusive_group(required=True)
  194. group.add_argument("--url", help="单个模型页面 URL")
  195. group.add_argument("--file", help="URL 列表文件(每行一个)")
  196. ap.add_argument("--headful", action="store_true", help="有头模式(方便调试)")
  197. ap.add_argument("--timeout", type=int, default=20000, help="导航超时毫秒,默认 20000")
  198. ap.add_argument("--browser-path", help="浏览器可执行文件路径")
  199. ap.add_argument(
  200. "--modules",
  201. default="info,rate,tool,price",
  202. help="要运行的模块,逗号分隔,可选: info,rate,tool,price(默认全部)",
  203. )
  204. ap.add_argument("--output-dir", default="output", help="结果保存目录,默认 output/")
  205. args = ap.parse_args()
  206. urls: List[str] = []
  207. if args.url:
  208. urls = [args.url]
  209. else:
  210. with open(args.file, "r", encoding="utf-8") as f:
  211. urls = [ln.strip() for ln in f if ln.strip()]
  212. exec_path = args.browser_path or os.environ.get("PLAYWRIGHT_EXECUTABLE")
  213. headless = not args.headful
  214. if os.environ.get("PLAYWRIGHT_HEADLESS", "").lower() == "false":
  215. headless = False
  216. modules = [m.strip() for m in args.modules.split(",") if m.strip()]
  217. print(f"[INFO] 运行模块: {modules}")
  218. os.makedirs(args.output_dir, exist_ok=True)
  219. all_results = []
  220. for u in urls:
  221. print(f"\n{'='*60}\n[INFO] 抓取: {u}", flush=True)
  222. res = scrape_all(u, headless=headless, timeout=args.timeout,
  223. executable_path=exec_path, modules=modules)
  224. all_results.append(res)
  225. # 保存单个结果
  226. model_id = res.get("model_id", "unknown")
  227. safe_id = re.sub(r"[^\w\-.]", "_", model_id)
  228. out_path = os.path.join(args.output_dir, f"{safe_id}.json")
  229. with open(out_path, "w", encoding="utf-8") as f:
  230. json.dump(res, f, ensure_ascii=False, indent=2)
  231. print(f"[INFO] 已保存: {out_path}")
  232. # 输出到 stdout
  233. print(json.dumps(all_results, ensure_ascii=False, indent=2))
  234. if __name__ == "__main__":
  235. main()