# scrape_aliyun_models.py
  1. #!/usr/bin/env python3
  2. """
  3. Aliyun Model Price Scraper - Final Improved Version
  4. - 使用 Playwright 渲染页面并抓取"模型价格"区域内的价格信息
  5. - 支持单个模型页面 URL,或从文件读取多个 URL
  6. 改进要点:
  7. 1. 能够生成阶梯计费结构:{input: {tier1: {...}, tier2: {...}}, output: {...}}
  8. 2. 优惠标记正确处理:label只保留基础部分,优惠信息放入note字段
  9. 3. 强化过滤:完全排除工具调用价格(包括"千次调用"单位)
  10. 依赖:
  11. pip install playwright beautifulsoup4 lxml
  12. python -m playwright install
  13. 用法示例:
  14. python scrape_aliyun_models.py --url "https://bailian.console.aliyun.com/.../qwen3-max"
  15. python scrape_aliyun_models.py --file urls.txt
  16. 输出: JSON 到 stdout
  17. """
import argparse
import json
import os
import re
import sys
import time
from typing import Dict, List, Optional

from bs4 import BeautifulSoup, FeatureNotFound
from bs4.element import Tag
from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError
  27. TOOL_CALL_RE = re.compile(
  28. r"调用|工具|接口|api调用|api|次调用|千次调用|/千次|每千次|搜索策略|代码解释|文生图|数据增强|模型推理",
  29. re.I,
  30. )
  31. def _is_tool_call_item(label: str, raw: str, unit: str) -> bool:
  32. label_l = label.lower()
  33. raw_l = raw.lower()
  34. unit_l = unit.lower()
  35. if TOOL_CALL_RE.search(label_l) or TOOL_CALL_RE.search(raw_l) or TOOL_CALL_RE.search(unit_l):
  36. return True
  37. if "千次" in unit_l or "/千" in unit_l or "次调用" in unit_l:
  38. return True
  39. return False
  40. def _find_nearest_tier_label(lines: List[str], idx: int) -> Optional[str]:
  41. tier_re = re.compile(r"(输入|输出).*(<=|>=|<|>|\b\d+\s*k|\d+\s*万|\d+\s*千|\d+\s*tokens?)", re.I)
  42. for step in range(1, 6):
  43. for pos in (idx - step, idx + step):
  44. if pos < 0 or pos >= len(lines):
  45. continue
  46. candidate = lines[pos]
  47. if not candidate or re.search(r"([0-9]+(?:\.[0-9]+)?)\s*元", candidate, re.I):
  48. continue
  49. if tier_re.search(candidate):
  50. return candidate.strip()
  51. return None
def _open_tier_dropdown(page) -> bool:
    """Try to open the tier-selection dropdown (e.g. '输入 <= 32K').

    Two strategies, in order:
      1. Playwright locator: click the first ant-design select whose visible
         text looks like a tier label (输入 followed by a number and K).
      2. JS fallback evaluated in the page: find a visible matching select
         and synthesize mousedown/mouseup/click on it.

    Returns True when a dropdown was (probably) opened, False otherwise.
    Never raises; all failures are swallowed and reported as False.
    """
    try:
        try:
            # Strategy 1: ant-design selector whose text contains a tier pattern.
            selector = page.locator(".efm_ant-select-selector, .ant-select-selector").filter(has_text=re.compile(r"输入.*\d+\s*[kK]"))
            if selector.count() > 0:
                selector.first.click(timeout=3000)
                time.sleep(0.5)  # let the dropdown animation finish
                return True
        except Exception as e:
            pass  # fall through to the JS-based strategy
        # Strategy 2: click the element from inside the page via JS.
        ok = page.evaluate(
            """
            () => {
                const isVisible = (el) => {
                    if (!el) return false;
                    const rect = el.getBoundingClientRect();
                    const style = window.getComputedStyle(el);
                    return rect.width > 0 && rect.height > 0 && style.display !== 'none' && style.visibility !== 'hidden';
                };
                const norm = (s) => (s || '').replace(/\s+/g, ' ').trim();
                const tierRe = /输入.*\d+\s*[kK]/i;
                let clickEl = null;
                const selectors = Array.from(document.querySelectorAll(
                    ".efm_ant-select-selector, .ant-select-selector"
                ));
                for (const el of selectors) {
                    const txt = norm(el.innerText || el.textContent);
                    if (tierRe.test(txt) && isVisible(el)) {
                        clickEl = el;
                        break;
                    }
                }
                if (!clickEl) {
                    const containers = Array.from(document.querySelectorAll(
                        ".efm_ant-select, .ant-select"
                    ));
                    for (const el of containers) {
                        const txt = norm(el.innerText || el.textContent);
                        if (tierRe.test(txt) && isVisible(el)) {
                            clickEl = el.querySelector(".efm_ant-select-selector, .ant-select-selector") || el;
                            break;
                        }
                    }
                }
                if (!isVisible(clickEl)) return false;
                clickEl.dispatchEvent(new MouseEvent('mousedown', { bubbles: true }));
                clickEl.dispatchEvent(new MouseEvent('mouseup', { bubbles: true }));
                clickEl.click();
                return true;
            }
            """
        )
        time.sleep(0.5)  # same settle delay as the locator path
        return bool(ok)
    except Exception:
        return False
  108. def _normalize_tier_option(opt: str) -> str:
  109. if not opt:
  110. return "unknown"
  111. s = opt.replace('\u00a0', ' ')
  112. m = re.search(r"(\d+\s*k\s*<\s*输入\s*<=\s*\d+\s*k)", s, re.I)
  113. if not m:
  114. m = re.search(r"(输入\s*<=\s*\d+\s*k)", s, re.I)
  115. if not m:
  116. m = re.search(r"(\d+\s*k\s*<\s*输入)", s, re.I)
  117. if m:
  118. key = m.group(1)
  119. key = re.sub(r"\s+", "", key)
  120. key = key.replace("输入", "input").replace("输出", "output")
  121. return key
  122. if "输入" in s or "输出" in s:
  123. nums = re.findall(r"\d+\s*k", s, re.I)
  124. if nums:
  125. joined = "-".join([n.replace(' ', '') for n in nums])
  126. if "输入" in s:
  127. return f"input_{joined}"
  128. return f"output_{joined}"
  129. short = re.sub(r"\s+", " ", s).strip()
  130. return short[:60]
def _get_tier_options(page) -> List[str]:
    """Open the tier dropdown and return the de-duplicated option texts.

    Primary path reads leaf-node texts inside the visible ant-design
    dropdown; fallback scans the whole document for visible tier-like
    texts.  The dropdown is closed again with Escape before returning.
    Returns an empty list when the dropdown cannot be opened or read.
    """
    if not _open_tier_dropdown(page):
        return []
    try:
        page.wait_for_selector(
            ".efm_ant-select-dropdown, .ant-select-dropdown",
            state="visible", timeout=3000
        )
    except Exception:
        pass  # dropdown may already be rendered; proceed regardless
    options = []
    try:
        # Primary: collect leaf texts from the visible dropdown only.
        options = page.evaluate(
            """
            () => {
                const isVisible = (el) => {
                    const r = el.getBoundingClientRect();
                    const s = window.getComputedStyle(el);
                    return r.width > 0 && r.height > 0 && s.display !== 'none' && s.visibility !== 'hidden';
                };
                const dropdown = Array.from(document.querySelectorAll(
                    '.efm_ant-select-dropdown, .ant-select-dropdown'
                )).find(el => isVisible(el));
                if (!dropdown) return [];
                const leaves = Array.from(dropdown.querySelectorAll('*'))
                    .filter(el => isVisible(el) && el.children.length === 0);
                const texts = leaves
                    .map(el => (el.innerText || el.textContent || '').replace(/\\s+/g, ' ').trim())
                    .filter(t => t.length > 0 && t.length < 60);
                return Array.from(new Set(texts));
            }
            """
        )
        # Keep only plausible tier labels: must mention 输入 and a size in K.
        options = [t for t in options if re.search(r"输入", t) and re.search(r"\d+\s*[kK]", t)]
    except Exception:
        options = []
    if not options:
        try:
            # Fallback: scan the entire document for visible tier-like texts.
            options = page.evaluate(
                """
                () => {
                    const isVisible = (el) => {
                        const r = el.getBoundingClientRect();
                        const s = window.getComputedStyle(el);
                        return r.width > 0 && r.height > 0 && s.display !== 'none' && s.visibility !== 'hidden';
                    };
                    const texts = Array.from(document.querySelectorAll('*'))
                        .filter(el => isVisible(el) && el.children.length === 0)
                        .map(el => (el.innerText || el.textContent || '').replace(/\\s+/g, ' ').trim())
                        .filter(t => t.length < 60 && /输入/.test(t) && /\\d+\\s*[kK]/.test(t) && /<=|</.test(t));
                    return Array.from(new Set(texts));
                }
                """
            )
        except Exception:
            options = []
    try:
        page.keyboard.press("Escape")  # close the dropdown again
    except Exception:
        pass
    # dict.fromkeys preserves order while de-duplicating.
    return list(dict.fromkeys(options))
def _select_tier_option(page, option_text: str) -> bool:
    """Open the tier dropdown and click the option whose text equals option_text.

    First tries Playwright's exact get_by_text click; falls back to a JS
    click on a matching ant-design option node.  Sleeps briefly after a
    successful click so the page can re-render prices for the new tier.
    Returns True on success, False otherwise; never raises.
    """
    if not _open_tier_dropdown(page):
        return False
    try:
        page.wait_for_selector(
            ".efm_ant-select-dropdown, .ant-select-dropdown",
            state="visible", timeout=2000,
        )
    except Exception:
        return False
    try:
        try:
            # Preferred: exact-text click via Playwright.
            option_loc = page.get_by_text(option_text, exact=True).first
            option_loc.click(timeout=3000, force=False)
            time.sleep(0.6)  # wait for prices to re-render for the new tier
            return True
        except Exception:
            pass  # fall back to the JS click below
        clicked = page.evaluate(
            """
            (opt) => {
                const isVisible = (el) => {
                    if (!el) return false;
                    const rect = el.getBoundingClientRect();
                    const style = window.getComputedStyle(el);
                    return rect.width > 0 && rect.height > 0 && style.display !== 'none' && style.visibility !== 'hidden';
                };
                const norm = (s) => (s || '').replace(/\s+/g, ' ').trim();
                const nodes = Array.from(document.querySelectorAll(
                    ".efm_ant-select-item-option-content, [role='option'], .efm_ant-select-item, .ant-select-item"
                ));
                const target = nodes.find((n) => norm(n.textContent) === opt && isVisible(n));
                if (!target) return false;
                const clickEl = target.closest(".efm_ant-select-item, [role='option']") || target;
                clickEl.dispatchEvent(new MouseEvent('mousedown', { bubbles: true }));
                clickEl.dispatchEvent(new MouseEvent('mouseup', { bubbles: true }));
                clickEl.click();
                return true;
            }
            """,
            option_text,
        )
        if clicked:
            time.sleep(0.6)
            return True
        return False
    except Exception:
        return False
  240. def _ensure_tiered_pricing(page) -> None:
  241. try:
  242. toggle = page.locator("text=阶梯计费").first
  243. if toggle.count() > 0:
  244. toggle.click()
  245. time.sleep(0.3)
  246. except Exception:
  247. pass
  248. def parse_prices_from_text(text: str) -> List[Dict]:
  249. lines = [ln.strip() for ln in text.splitlines()]
  250. lines = [ln for ln in lines if ln]
  251. items = []
  252. price_re = re.compile(r"([0-9]+(?:\.[0-9]+)?)\s*元", re.I)
  253. for idx, ln in enumerate(lines):
  254. matches = price_re.findall(ln)
  255. if not matches:
  256. continue
  257. label = None
  258. first_m = price_re.search(ln)
  259. if first_m:
  260. before = ln[: first_m.start()].strip()
  261. if before:
  262. label = before
  263. if not label:
  264. for j in range(idx - 1, -1, -1):
  265. if lines[j] and not price_re.search(lines[j]):
  266. label = lines[j]
  267. break
  268. if not label:
  269. label = f"price_{len(items) + 1}"
  270. if label == "原价":
  271. if items and matches:
  272. try:
  273. items[-1]["price_original"] = float(matches[0])
  274. except Exception:
  275. items[-1]["price_original"] = matches[0]
  276. items[-1].setdefault("note", "")
  277. if items[-1]["note"]:
  278. items[-1]["note"] += "; 原价显示"
  279. else:
  280. items[-1]["note"] = "原价显示"
  281. continue
  282. raw = ln
  283. if re.fullmatch(r"输入|输出", label.strip()):
  284. tier_label = _find_nearest_tier_label(lines, idx)
  285. if tier_label:
  286. label = tier_label
  287. entry: Dict = {"label": label.strip(), "raw": raw}
  288. try:
  289. nums = [float(x) for x in matches]
  290. if len(nums) == 1:
  291. entry["price"] = nums[0]
  292. else:
  293. fnums = sorted(nums)
  294. entry["price_current"] = fnums[0]
  295. entry["price_original"] = fnums[-1]
  296. except Exception:
  297. try:
  298. entry["price"] = float(matches[0])
  299. except Exception:
  300. entry["price"] = matches[0]
  301. unit = None
  302. if re.search(r"每千|每 1k|/千|/每千|tokens", raw, re.I):
  303. unit = "元/每千tokens"
  304. unit_m = re.search(r"元\s*/?\s*每[^\n,,;]*", raw)
  305. if unit_m:
  306. unit = unit_m.group(0)
  307. if unit:
  308. entry["unit"] = unit
  309. note = []
  310. if re.search(r"限时|折", raw):
  311. note.append("限时优惠")
  312. if re.search(r"原价", raw):
  313. note.append("原价显示")
  314. if note:
  315. entry["note"] = "; ".join(note)
  316. entry["currency"] = "CNY"
  317. items.append(entry)
  318. return items
  319. def extract_price_block_html(html: str) -> str:
  320. try:
  321. soup = BeautifulSoup(html, "lxml")
  322. except FeatureNotFound:
  323. soup = BeautifulSoup(html, "html.parser")
  324. # 跳过 script/style 标签内的文本节点
  325. node = None
  326. for n in soup.find_all(string=re.compile(r"模型价格")):
  327. if n.parent and n.parent.name in ("script", "style"):
  328. continue
  329. node = n
  330. break
  331. if not node:
  332. return soup.get_text(separator="\n")
  333. ancestor = node.parent
  334. for _ in range(6):
  335. txt = ancestor.get_text(separator="\n")
  336. if "元" in txt or re.search(r"\d", txt) or "tokens" in txt.lower():
  337. return txt
  338. if ancestor.parent:
  339. ancestor = ancestor.parent
  340. else:
  341. break
  342. return ancestor.get_text(separator="\n")
def extract_price_items_from_html(html: str) -> List[Dict]:
    """Extract structured price items from the '模型价格' section of the HTML.

    Pipeline:
      1. Find the section container (same ancestor-climbing as
         extract_price_block_html) and parse its text into flat items.
      2. Post-process: drop tool-call rows, merge '原价' rows into the
         previous item, normalize notes/units, clean labels.
      3. Group 输入/输出 (input/output) rows into {"label": group,
         "tiers": {...}} entries.
      4. If nothing was found, fall back to scanning elements whose class
         contains 'price' and guessing label/unit from nearby nodes.

    Returns an empty list when the '模型价格' marker is absent.
    """
    try:
        soup = BeautifulSoup(html, "lxml")
    except FeatureNotFound:
        soup = BeautifulSoup(html, "html.parser")
    # Locate the marker text node, skipping script/style payloads.
    node = None
    for n in soup.find_all(string=re.compile(r"模型价格")):
        if n.parent and n.parent.name in ("script", "style"):
            continue
        node = n
        break
    if not node:
        return []
    # Climb ancestors until the subtree text looks price-bearing.
    ancestor = node.parent
    container = ancestor
    for _ in range(6):
        txt = ancestor.get_text(separator="\n")
        if "元" in txt or re.search(r"\d", txt) or "tokens" in txt.lower():
            container = ancestor
            break
        if ancestor.parent:
            ancestor = ancestor.parent
        else:
            container = ancestor
            break
    price_re = re.compile(r"([0-9]+(?:\.[0-9]+)?)\s*元", re.I)  # NOTE(review): unused here
    items: List[Dict] = []
    container_text = container.get_text(separator="\n")
    items = parse_prices_from_text(container_text)

    def _postprocess_items(raw_items: List[Dict]) -> List[Dict]:
        """Filter tool-call rows, merge 原价 rows, normalize note/unit/label."""
        filtered: List[Dict] = []
        for it in raw_items:
            raw = it.get("raw", "")
            label = it.get("label", "")
            unit = it.get("unit", "")
            # Exclude per-call (tool/API) pricing entirely.
            if _is_tool_call_item(label, raw, unit):
                continue
            # An '原价' row annotates the previous kept item.
            if "原价" in label and filtered:
                if "price" in it:
                    filtered[-1]["price_original"] = it["price"]
                elif "price_current" in it and "price_original" in it:
                    filtered[-1]["price_original"] = it["price_original"]
                filtered[-1].setdefault("note", "")
                if filtered[-1]["note"]:
                    filtered[-1]["note"] += "; 原价显示"
                else:
                    filtered[-1]["note"] = "原价显示"
                continue
            # Build the note: explicit N折 discount beats the generic markers.
            notes = []
            discount_match = re.search(r"(限时)?([0-9.]+)\s*折", raw)
            if discount_match:
                discount = discount_match.group(2)
                notes.append(f"限时{discount}折")
            else:
                if re.search(r"限时|免费", raw) or re.search(r"限时|免费", label):
                    if re.search(r"免费", raw):
                        notes.append("限时免费")
                    else:
                        notes.append("限时优惠")
            if re.search(r"原价", raw):
                notes.append("原价显示")
            if notes:
                it["note"] = "; ".join(notes)
            # Fill in a unit when the parser did not find one.
            if "unit" not in it:
                if re.search(r"每千|tokens|/千|/每千", raw, re.I):
                    it["unit"] = "元/每千tokens"
                else:
                    um = re.search(r"元\s*/?\s*每[^\n,,;]*", raw)
                    if um:
                        it["unit"] = um.group(0)
            # Keep only the base part of the label; promo text moved to note.
            cleaned_label = re.sub(r"限时[0-9.]*折|限时|免费|原价|\s*元.*", "", label).strip()
            cleaned_label = re.sub(r"\s+", " ", cleaned_label).strip()
            if not cleaned_label:
                cleaned_label = "price"
            it["label"] = cleaned_label
            it["currency"] = "CNY"
            filtered.append(it)
        return filtered

    filtered = _postprocess_items(items)
    # Group input/output rows into tiered structures.
    structured: List[Dict] = []
    grouped: Dict[str, Dict[str, Dict]] = {}
    for it in filtered:
        lbl = it.get("label", "")
        raw = it.get("raw", "")
        combined = lbl + " " + raw  # NOTE(review): computed but unused
        should_group = False
        group = None
        if re.search(r"输入", lbl):
            should_group = True
            group = "input"
        elif re.search(r"输出", lbl):
            should_group = True
            group = "output"
        if "tier" in it:
            # Item came from a specific dropdown tier; key by normalized tier.
            tier_raw = it.get("tier") or ""
            tier_key = _normalize_tier_option(tier_raw)
            if not group:
                if "input" in tier_key.lower():
                    group = "input"
                elif "output" in tier_key.lower():
                    group = "output"
                else:
                    group = "input"  # heuristic default when undecidable
            tier_data = {k: v for k, v in it.items() if k not in ("label", "tier")}
            grouped.setdefault(group, {})[tier_key] = tier_data
        elif should_group and group:
            # Label-based grouping: translate the 输入/输出 prefix.
            key = lbl
            if group == "input":
                key = re.sub(r"^输入", "input", key)
            elif group == "output":
                key = re.sub(r"^输出", "output", key)
            tier_data = {k: v for k, v in it.items() if k not in ("label",)}
            grouped.setdefault(group, {})[key] = tier_data
        else:
            structured.append(it)
    for g, mapping in grouped.items():
        structured.append({"label": g, "tiers": mapping})
    items = structured
    if not items:
        # DOM fallback: elements whose class mentions 'price', with nearby
        # 'unit'/'label' classed elements or preceding siblings as metadata.
        try:
            price_nodes = []
            for el in soup.find_all(class_=re.compile(r"price", re.I)):
                text = el.get_text(" ", strip=True)
                if not re.search(r"[0-9]+(\.[0-9]+)?", text):
                    continue
                price_nodes.append((el, text))
            seen = set()
            for el, text in price_nodes:
                if text in seen:
                    continue
                seen.add(text)
                unit_el = el.find_next(class_=re.compile(r"unit", re.I))
                unit_text = unit_el.get_text(" ", strip=True) if unit_el else None
                # Try up to 4 ancestor levels for a sibling 'label' element.
                label = None
                p = el
                for _ in range(4):
                    sib_label = None
                    parent = p.parent
                    if parent:
                        sib_label = parent.find(class_=re.compile(r"label", re.I))
                    if sib_label and sib_label.get_text(strip=True):
                        label = sib_label.get_text(" ", strip=True)
                        break
                    if parent is None:
                        break
                    p = parent
                if not label:
                    # Otherwise scan up to 6 preceding siblings for digit-free text.
                    prev = el.previous_sibling
                    steps = 0
                    while prev and steps < 6:
                        candidate = None
                        if isinstance(prev, str) and prev.strip():
                            candidate = prev.strip()
                        else:
                            try:
                                candidate = prev.get_text(" ", strip=True)
                            except Exception:
                                candidate = None
                        if candidate and not re.search(r"[0-9]", candidate):
                            label = candidate
                            break
                        prev = prev.previous_sibling
                        steps += 1
                entry = {"label": label or "price", "raw": text, "currency": "CNY"}
                try:
                    entry["price"] = float(re.search(r"([0-9]+(?:\.[0-9]+)?)", text).group(1))
                except Exception:
                    entry["price"] = text
                if unit_text:
                    entry["unit"] = unit_text
                items.append(entry)
        except Exception:
            pass
        if items:
            items = _postprocess_items(items)
    return items
  519. def extract_price_items_global(html: str) -> List[Dict]:
  520. try:
  521. soup = BeautifulSoup(html, "lxml")
  522. except FeatureNotFound:
  523. soup = BeautifulSoup(html, "html.parser")
  524. node = None
  525. for n in soup.find_all(string=re.compile(r"模型价格")):
  526. if n.parent and n.parent.name in ("script", "style"):
  527. continue
  528. node = n
  529. break
  530. if not node:
  531. return []
  532. ancestor = node.parent
  533. for _ in range(6):
  534. txt = ancestor.get_text(separator="\n")
  535. if "元" in txt or re.search(r"\d", txt) or "tokens" in txt.lower():
  536. return parse_prices_from_text(txt)
  537. if ancestor.parent:
  538. ancestor = ancestor.parent
  539. else:
  540. break
  541. return parse_prices_from_text(ancestor.get_text(separator="\n"))
  542. def scrape_model_price(url: str, headless: bool = True, timeout: int = 20000, executable_path: Optional[str] = None) -> Dict:
  543. result = {"url": url, "error": None, "items": []}
  544. with sync_playwright() as p:
  545. launch_kwargs = {"headless": headless}
  546. if executable_path:
  547. launch_kwargs["executable_path"] = executable_path
  548. extra_args_env = os.environ.get("PLAYWRIGHT_EXTRA_ARGS", "")
  549. extra_args = [a.strip() for a in extra_args_env.split(",") if a.strip()]
  550. if extra_args:
  551. launch_kwargs["args"] = extra_args
  552. browser = p.chromium.launch(**launch_kwargs)
  553. context = browser.new_context()
  554. page = context.new_page()
  555. network_hits = []
  556. console_logs = []
  557. def _on_console(msg):
  558. try:
  559. console_logs.append({"type": msg.type, "text": msg.text})
  560. except Exception:
  561. pass
  562. def _on_response(resp):
  563. try:
  564. url_r = resp.url
  565. ct = resp.headers.get("content-type", "")
  566. if "application/json" in ct or ct.startswith("text") or "json" in url_r.lower() or "price" in url_r.lower():
  567. try:
  568. body = resp.text()
  569. except Exception:
  570. body = None
  571. snippet = None
  572. if body:
  573. if "元" in body or "price" in body.lower() or "tokens" in body.lower() or "price" in url_r.lower():
  574. snippet = body[:2000]
  575. if snippet:
  576. network_hits.append({"url": url_r, "content_type": ct, "snippet": snippet})
  577. except Exception:
  578. pass
  579. page.on("console", _on_console)
  580. page.on("response", _on_response)
  581. try:
  582. page.goto(url, wait_until="networkidle", timeout=timeout)
  583. except PlaywrightTimeoutError:
  584. try:
  585. page.goto(url, wait_until="load", timeout=timeout)
  586. except Exception as e:
  587. result["error"] = f"导航失败: {e}"
  588. browser.close()
  589. return result
  590. try:
  591. page.wait_for_selector("text=模型价格", timeout=8000)
  592. except PlaywrightTimeoutError:
  593. pass
  594. time.sleep(1.2)
  595. html = page.content()
  596. items = []
  597. try:
  598. items = extract_price_items_from_html(html)
  599. except Exception:
  600. items = []
  601. tiered_items: List[Dict] = []
  602. try:
  603. _ensure_tiered_pricing(page)
  604. tier_options = _get_tier_options(page)
  605. for opt in tier_options:
  606. if not _select_tier_option(page, opt):
  607. continue
  608. html = page.content()
  609. try:
  610. tier_items = extract_price_items_from_html(html)
  611. except Exception:
  612. tier_items = []
  613. for it in tier_items:
  614. it["tier"] = opt
  615. tiered_items.extend(tier_items)
  616. except Exception:
  617. tiered_items = []
  618. if tiered_items:
  619. items = tiered_items
  620. if not items:
  621. try:
  622. page.wait_for_selector("text=/[0-9]+(\\.[0-9]+)?\\s*元/", timeout=8000)
  623. except PlaywrightTimeoutError:
  624. pass
  625. try:
  626. page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
  627. time.sleep(1.0)
  628. html = page.content()
  629. items = extract_price_items_from_html(html)
  630. except Exception:
  631. items = []
  632. if not items:
  633. text_block = extract_price_block_html(html)
  634. if not text_block:
  635. result["error"] = "未找到包含 '模型价格' 的区域,可能需要登录或页面结构不同。"
  636. browser.close()
  637. return result
  638. items = parse_prices_from_text(text_block)
  639. def _build_price_map(parsed_items: List[Dict]) -> Dict:
  640. price_map: Dict = {}
  641. for it in parsed_items:
  642. if isinstance(it, dict) and it.get("tiers") and isinstance(it.get("tiers"), dict):
  643. for tier_key, tier_val in it["tiers"].items():
  644. k = _normalize_tier_option(tier_key)
  645. price_map.setdefault(k, {})
  646. sub_label = tier_val.get("label") or tier_val.get("raw") or k
  647. price_map[k][sub_label] = {k2: v for k2, v in tier_val.items() if k2 not in ("tier", "tiers", "label")}
  648. continue
  649. if it.get("tier"):
  650. tk = _normalize_tier_option(it.get("tier"))
  651. price_map.setdefault(tk, {})
  652. sub_label = it.get("label") or it.get("raw") or tk
  653. price_map[tk][sub_label] = {k: v for k, v in it.items() if k not in ("tier", "label")}
  654. continue
  655. lbl = it.get("label") or it.get("raw") or "price"
  656. if lbl in price_map and not isinstance(price_map[lbl], list):
  657. price_map[lbl] = [price_map[lbl]]
  658. if isinstance(price_map.get(lbl), list):
  659. price_map[lbl].append({k: v for k, v in it.items() if k != "label"})
  660. else:
  661. price_map[lbl] = {k: v for k, v in it.items() if k != "label"}
  662. return price_map
  663. price_map = _build_price_map(items)
  664. result = {"url": url, "error": result.get("error"), "prices": price_map}
  665. browser.close()
  666. return result
  667. def main():
  668. ap = argparse.ArgumentParser(description="爬取阿里云模型市场页面的模型价格(基于 Playwright)")
  669. group = ap.add_mutually_exclusive_group(required=True)
  670. group.add_argument("--url", help="单个模型页面 URL")
  671. group.add_argument("--file", help="包含多个 URL(每行一个)的文件路径")
  672. ap.add_argument("--headful", action="store_true", help="以有头模式打开浏览器(方便调试)")
  673. ap.add_argument("--timeout", type=int, default=20000, help="导航超时(毫秒),默认20000")
  674. ap.add_argument("--browser-path", help="浏览器可执行文件完整路径")
  675. args = ap.parse_args()
  676. urls: List[str] = []
  677. if args.url:
  678. urls = [args.url]
  679. else:
  680. with open(args.file, "r", encoding="utf-8") as f:
  681. urls = [ln.strip() for ln in f if ln.strip()]
  682. exec_path = None
  683. if args.browser_path:
  684. exec_path = args.browser_path
  685. else:
  686. exec_path = os.environ.get("PLAYWRIGHT_EXECUTABLE")
  687. headless = not args.headful
  688. if os.environ.get("PLAYWRIGHT_HEADLESS", "").lower() == "false":
  689. headless = False
  690. results = []
  691. for u in urls:
  692. print(f"抓取: {u}")
  693. res = scrape_model_price(u, headless=headless, timeout=args.timeout, executable_path=exec_path)
  694. results.append(res)
  695. print(json.dumps(results, ensure_ascii=False, indent=2))
  696. if __name__ == "__main__":
  697. main()