main.py 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405
  1. #!/usr/bin/env python3
  2. """
main.py - Entry point for scraping complete model information from
Alibaba Cloud Bailian.

Combines the modules below so that each URL is opened in the browser only
once and every scraper runs against that page in turn:
- scrape_aliyun_models.py → model prices (including tiered pricing)
- scrape_model_info.py → basic model information + capabilities
- scrape_rate_limits.py → rate limits and context window
- scrape_tool_prices.py → tool-call prices
- scrape_model_icon.py → model icon

Usage:
python main.py --url "https://bailian.console.aliyun.com/...#/model-market/detail/qwen3-max"
python main.py --file urls.txt
python main.py --url "..." --browser-path "D:\\playwright-browsers\\...\\chrome.exe"
python main.py --url "..." --modules info,price,rate,tool # run only the listed modules
python main.py --url "..." --headful # headed mode, for debugging

Output: JSON on stdout, also saved to output/<model_id>.json
  16. """
  17. import argparse
  18. import json
  19. import os
  20. import re
  21. import time
  22. from typing import Dict, List, Optional
  23. from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError
  24. # 导入各模块的核心解析函数(不启动独立浏览器)
  25. from scrape_model_info import (
  26. _extract_model_id_from_url,
  27. _find_model_in_json,
  28. parse_model_info,
  29. API_URL_RE as INFO_API_RE,
  30. )
  31. from scrape_rate_limits import (
  32. parse_rate_limits_from_text,
  33. _get_rate_limit_section_text,
  34. )
  35. from scrape_tool_prices import (
  36. parse_tool_prices_from_text,
  37. _get_tool_price_section_text,
  38. )
  39. from scrape_aliyun_models import (
  40. scrape_model_price,
  41. )
  42. from scrape_model_icon import _extract_icon_from_page
  43. def _navigate(page, url: str, timeout: int) -> bool:
  44. """导航到 URL,返回是否成功。"""
  45. try:
  46. page.goto(url, wait_until="domcontentloaded", timeout=timeout)
  47. return True
  48. except PlaywrightTimeoutError:
  49. try:
  50. page.goto(url, wait_until="load", timeout=timeout)
  51. return True
  52. except Exception as e:
  53. print(f"[ERROR] 导航失败: {e}")
  54. return False
  55. def _wait_for_content(page) -> None:
  56. """等待页面核心内容渲染完成。"""
  57. for sel in ["text=模型价格", "text=模型介绍", "text=模型能力", "text=模型限流"]:
  58. try:
  59. page.wait_for_selector(sel, timeout=6000)
  60. break
  61. except PlaywrightTimeoutError:
  62. pass
  63. time.sleep(1.5)
  64. # 滚动触发懒加载
  65. try:
  66. page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
  67. time.sleep(0.8)
  68. page.evaluate("window.scrollTo(0, 0)")
  69. time.sleep(0.3)
  70. except Exception:
  71. pass
  72. def _parse_cookies_env(cookies_str: str, domain: str = ".aliyun.com") -> List[Dict]:
  73. """
  74. 将 Cookie 字符串(浏览器复制的 name=value; name2=value2 格式)
  75. 解析为 Playwright set_cookies 所需的列表格式。
  76. """
  77. cookies = []
  78. for part in cookies_str.split(";"):
  79. part = part.strip()
  80. if not part:
  81. continue
  82. if "=" in part:
  83. name, _, value = part.partition("=")
  84. cookies.append({
  85. "name": name.strip(),
  86. "value": value.strip(),
  87. "domain": domain,
  88. "path": "/",
  89. })
  90. return cookies
  91. def scrape_all(
  92. url: str,
  93. headless: bool = True,
  94. timeout: int = 20000,
  95. executable_path: Optional[str] = None,
  96. modules: Optional[List[str]] = None,
  97. api_key: Optional[str] = None,
  98. model_hint: Optional[str] = None,
  99. ) -> Dict:
  100. """
  101. 对单个 URL 运行所有(或指定)模块,共享一个浏览器实例。
  102. modules 可选值: ["info", "rate", "tool", "price"]
  103. 默认全部运行。
  104. api_key: 可选的 API 密钥,将通过请求头传递给目标站点。
  105. model_hint: 可选的模型名称提示,优先用于 API JSON 匹配,而不是从 URL 提取。
  106. """
  107. if modules is None:
  108. modules = ["info", "rate", "tool", "price", "icon"]
  109. # 优先用外部传入的 model_hint,否则从 URL 提取
  110. target = model_hint.strip() if model_hint and model_hint.strip() else _extract_model_id_from_url(url)
  111. result: Dict = {"url": url, "model_id": target, "error": None}
  112. # price 模块复用原始脚本,独立启动浏览器(原脚本结构限制)
  113. # 其余模块共享一个浏览器实例
  114. shared_modules = [m for m in modules if m != "price"]
  115. # 从环境变量读取登录 Cookie
  116. aliyun_cookies_str = os.environ.get("ALIYUN_COOKIES", "").strip()
  117. # ── 共享浏览器:info / rate / tool ──────────────────────────────────────────
  118. if shared_modules:
  119. api_data: List[Dict] = []
  120. with sync_playwright() as p:
  121. launch_kwargs: Dict = {"headless": headless}
  122. if executable_path:
  123. launch_kwargs["executable_path"] = executable_path
  124. # 额外 Chrome 启动参数(生产环境 Linux 可通过 PLAYWRIGHT_EXTRA_ARGS 注入)
  125. extra_args_env = os.environ.get("PLAYWRIGHT_EXTRA_ARGS", "")
  126. extra_args = [a.strip() for a in extra_args_env.split(",") if a.strip()]
  127. if extra_args:
  128. launch_kwargs["args"] = extra_args
  129. browser = p.chromium.launch(**launch_kwargs)
  130. # 如果有 api_key,通过额外请求头传递
  131. context_kwargs: Dict = {}
  132. if api_key:
  133. context_kwargs["extra_http_headers"] = {"Authorization": f"Bearer {api_key}"}
  134. context = browser.new_context(**context_kwargs)
  135. # 注入登录 Cookie(避免被重定向到登录/免费试用页)
  136. if aliyun_cookies_str:
  137. cookies = _parse_cookies_env(aliyun_cookies_str)
  138. if cookies:
  139. context.add_cookies(cookies)
  140. print(f"[INFO] 已注入 {len(cookies)} 个 Cookie")
  141. page = context.new_page()
  142. # 只拦截匹配 INFO_API_RE 的 JSON API 请求,其余直接放行
  143. # 避免对图片/日志等请求调用 route.fetch() 导致 DNS 失败崩溃
  144. import json as _json
  145. def handle_api_route(route, request):
  146. try:
  147. resp = route.fetch()
  148. try:
  149. ct = resp.headers.get("content-type", "")
  150. if "application/json" in ct:
  151. api_data.append(_json.loads(resp.body()))
  152. except Exception:
  153. pass
  154. route.fulfill(response=resp)
  155. except Exception as e:
  156. try:
  157. route.continue_()
  158. except Exception:
  159. pass
  160. # 只对匹配 API 的 URL 注册拦截,其余请求不拦截(直接走浏览器默认行为)
  161. page.route(
  162. lambda url: bool(INFO_API_RE.search(url)),
  163. handle_api_route,
  164. )
  165. if not _navigate(page, url, timeout):
  166. result["error"] = "导航失败"
  167. browser.close()
  168. else:
  169. try:
  170. page.wait_for_load_state("networkidle", timeout=20000)
  171. except PlaywrightTimeoutError:
  172. pass
  173. _wait_for_content(page)
  174. # 从 API 找模型对象
  175. model_obj = None
  176. for body in api_data:
  177. found = _find_model_in_json(body, target)
  178. if found:
  179. model_obj = found
  180. break
  181. if not model_obj:
  182. print(f"[WARN] 未从 API 找到模型 '{target}',部分字段将为空")
  183. # ── info 模块 ──
  184. if "info" in shared_modules:
  185. if model_obj:
  186. result["info"] = parse_model_info(model_obj)
  187. else:
  188. result["info"] = {"error": f"未找到模型 '{target}'"}
  189. # ── rate 模块 ──
  190. if "rate" in shared_modules:
  191. rate_text = _get_rate_limit_section_text(page)
  192. result["rate_limits"] = parse_rate_limits_from_text(rate_text) if rate_text else {}
  193. # ── tool 模块 ──
  194. if "tool" in shared_modules:
  195. html = page.content()
  196. tool_text = _get_tool_price_section_text(html)
  197. result["tool_call_prices"] = parse_tool_prices_from_text(tool_text) if tool_text else []
  198. # ── icon 模块 ──
  199. if "icon" in shared_modules:
  200. icon = _extract_icon_from_page(page)
  201. result["icon"] = icon.get("data") if icon.get("type") != "none" else None
  202. # ── price 模块(复用共享浏览器) ──
  203. if "price" in modules:
  204. try:
  205. from scrape_aliyun_models import (
  206. extract_price_items_from_html,
  207. extract_price_block_html,
  208. parse_prices_from_text,
  209. _ensure_tiered_pricing,
  210. _get_tier_options,
  211. _select_tier_option,
  212. _normalize_tier_option,
  213. )
  214. import time as _time
  215. _ensure_tiered_pricing(page)
  216. tier_options = _get_tier_options(page)
  217. tiered_items = []
  218. if tier_options:
  219. for opt in tier_options:
  220. if not _select_tier_option(page, opt):
  221. continue
  222. html = page.content()
  223. try:
  224. tier_items = extract_price_items_from_html(html)
  225. except Exception:
  226. tier_items = []
  227. for it in tier_items:
  228. it["tier"] = opt
  229. tiered_items.extend(tier_items)
  230. if tiered_items:
  231. items = tiered_items
  232. else:
  233. html = page.content()
  234. items = extract_price_items_from_html(html)
  235. if not items:
  236. text_block = extract_price_block_html(html)
  237. items = parse_prices_from_text(text_block) if text_block else []
  238. # 构建 price_map(复用 scrape_model_price 里的逻辑)
  239. def _build_price_map(parsed_items):
  240. price_map = {}
  241. for it in parsed_items:
  242. if isinstance(it, dict) and it.get("tiers") and isinstance(it.get("tiers"), dict):
  243. for tier_key, tier_val in it["tiers"].items():
  244. k = _normalize_tier_option(tier_key)
  245. price_map.setdefault(k, {})
  246. sub_label = tier_val.get("label") or tier_val.get("raw") or k
  247. price_map[k][sub_label] = {kk: v for kk, v in tier_val.items() if kk not in ("tier", "tiers", "label")}
  248. continue
  249. if it.get("tier"):
  250. tk = _normalize_tier_option(it.get("tier"))
  251. price_map.setdefault(tk, {})
  252. sub_label = it.get("label") or it.get("raw") or tk
  253. price_map[tk][sub_label] = {kk: v for kk, v in it.items() if kk not in ("tier", "label")}
  254. continue
  255. lbl = it.get("label") or it.get("raw") or "price"
  256. if lbl in price_map and not isinstance(price_map[lbl], list):
  257. price_map[lbl] = [price_map[lbl]]
  258. if isinstance(price_map.get(lbl), list):
  259. price_map[lbl].append({kk: v for kk, v in it.items() if kk != "label"})
  260. else:
  261. price_map[lbl] = {kk: v for kk, v in it.items() if kk != "label"}
  262. return price_map
  263. result["prices"] = _build_price_map(items)
  264. except Exception as e:
  265. import traceback as _tb
  266. print(f"[ERROR] 价格模块异常: {e}\n{_tb.format_exc()}")
  267. result["prices"] = {}
  268. result["price_error"] = str(e)
  269. browser.close()
  270. # ── price 模块回退:若 shared_modules 为空(不含 info/rate/tool),独立启动浏览器 ──
  271. if "price" in modules and not shared_modules:
  272. print(f"[INFO] 运行价格模块(独立浏览器)...")
  273. try:
  274. price_result = scrape_model_price(
  275. url,
  276. headless=headless,
  277. timeout=timeout,
  278. executable_path=executable_path,
  279. api_key=api_key,
  280. cookies_str=aliyun_cookies_str,
  281. )
  282. result["prices"] = price_result.get("prices", {})
  283. if price_result.get("error"):
  284. result["price_error"] = price_result["error"]
  285. print(f"[WARN] 价格模块错误: {price_result['error']}")
  286. except Exception as e:
  287. import traceback as _tb
  288. print(f"[ERROR] 价格模块异常: {e}\n{_tb.format_exc()}")
  289. result["prices"] = {}
  290. result["price_error"] = str(e)
  291. return result
  292. def main():
  293. ap = argparse.ArgumentParser(
  294. description="阿里云百炼模型完整信息抓取(整合所有模块)",
  295. formatter_class=argparse.RawDescriptionHelpFormatter,
  296. epilog="""
  297. 模块说明:
  298. info - 模型基本信息、能力、模态
  299. rate - 限流与上下文(RPM、context window 等)
  300. tool - 工具调用价格
  301. price - 模型 token 价格(含阶梯计费)
  302. 示例:
  303. python main.py --url "https://..." --browser-path "D:\\chrome.exe"
  304. python main.py --file urls.txt --headful
  305. python main.py --url "https://..." --modules info,rate
  306. """,
  307. )
  308. group = ap.add_mutually_exclusive_group(required=True)
  309. group.add_argument("--url", help="单个模型页面 URL")
  310. group.add_argument("--file", help="URL 列表文件(每行一个)")
  311. ap.add_argument("--headful", action="store_true", help="有头模式(方便调试)")
  312. ap.add_argument("--timeout", type=int, default=20000, help="导航超时毫秒,默认 20000")
  313. ap.add_argument("--browser-path", help="浏览器可执行文件路径")
  314. ap.add_argument(
  315. "--modules",
  316. default="info,rate,tool,price",
  317. help="要运行的模块,逗号分隔,可选: info,rate,tool,price(默认全部)",
  318. )
  319. ap.add_argument("--output-dir", default="output", help="结果保存目录,默认 output/")
  320. args = ap.parse_args()
  321. urls: List[str] = []
  322. if args.url:
  323. urls = [args.url]
  324. else:
  325. with open(args.file, "r", encoding="utf-8") as f:
  326. urls = [ln.strip() for ln in f if ln.strip()]
  327. exec_path = args.browser_path or os.environ.get("PLAYWRIGHT_EXECUTABLE")
  328. headless = not args.headful
  329. if os.environ.get("PLAYWRIGHT_HEADLESS", "").lower() == "false":
  330. headless = False
  331. modules = [m.strip() for m in args.modules.split(",") if m.strip()]
  332. print(f"[INFO] 运行模块: {modules}")
  333. os.makedirs(args.output_dir, exist_ok=True)
  334. all_results = []
  335. for u in urls:
  336. print(f"\n{'='*60}\n[INFO] 抓取: {u}", flush=True)
  337. res = scrape_all(u, headless=headless, timeout=args.timeout,
  338. executable_path=exec_path, modules=modules)
  339. all_results.append(res)
  340. # 保存单个结果
  341. model_id = res.get("model_id", "unknown")
  342. safe_id = re.sub(r"[^\w\-.]", "_", model_id)
  343. out_path = os.path.join(args.output_dir, f"{safe_id}.json")
  344. with open(out_path, "w", encoding="utf-8") as f:
  345. json.dump(res, f, ensure_ascii=False, indent=2)
  346. print(f"[INFO] 已保存: {out_path}")
  347. # 输出到 stdout
  348. print(json.dumps(all_results, ensure_ascii=False, indent=2))
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()