scrape_aliyun_models.py 38 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938
  1. #!/usr/bin/env python3
  2. """
  3. Aliyun Model Price Scraper - Final Improved Version
  4. - 使用 Playwright 渲染页面并抓取"模型价格"区域内的价格信息
  5. - 支持单个模型页面 URL,或从文件读取多个 URL
  6. 改进要点:
  7. 1. 能够生成阶梯计费结构:{input: {tier1: {...}, tier2: {...}}, output: {...}}
  8. 2. 优惠标记正确处理:label只保留基础部分,优惠信息放入note字段
  9. 3. 强化过滤:完全排除工具调用价格(包括"千次调用"单位)
  10. 依赖:
  11. pip install playwright beautifulsoup4 lxml
  12. python -m playwright install
  13. 用法示例:
  14. python scrape_aliyun_models.py --url "https://bailian.console.aliyun.com/.../qwen3-max"
  15. python scrape_aliyun_models.py --file urls.txt
  16. 输出: JSON 到 stdout
  17. """
import argparse
import json
import os
import re
import sys
import time
from typing import Dict, List, Optional

from bs4 import BeautifulSoup, FeatureNotFound
from bs4.element import Tag
from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError
# Case-insensitive pattern matching labels/units that belong to tool/API-call
# pricing (e.g. "千次调用" = per-thousand-calls) or auxiliary features, which
# must be excluded from model token pricing. Used by _is_tool_call_item.
TOOL_CALL_RE = re.compile(
    r"调用|工具|接口|api调用|api|次调用|千次调用|/千次|每千次|搜索策略|代码解释|文生图|数据增强|模型推理",
    re.I,
)
  31. def _is_tool_call_item(label: str, raw: str, unit: str) -> bool:
  32. label_l = label.lower()
  33. raw_l = raw.lower()
  34. unit_l = unit.lower()
  35. if TOOL_CALL_RE.search(label_l) or TOOL_CALL_RE.search(raw_l) or TOOL_CALL_RE.search(unit_l):
  36. return True
  37. if "千次" in unit_l or "/千" in unit_l or "次调用" in unit_l:
  38. return True
  39. return False
  40. def _find_nearest_tier_label(lines: List[str], idx: int) -> Optional[str]:
  41. tier_re = re.compile(r"(输入|输出).*(<=|>=|<|>|\b\d+\s*k|\d+\s*万|\d+\s*千|\d+\s*tokens?)", re.I)
  42. for step in range(1, 6):
  43. for pos in (idx - step, idx + step):
  44. if pos < 0 or pos >= len(lines):
  45. continue
  46. candidate = lines[pos]
  47. if not candidate or re.search(r"([0-9]+(?:\.[0-9]+)?)\s*元", candidate, re.I):
  48. continue
  49. if tier_re.search(candidate):
  50. return candidate.strip()
  51. return None
def _open_tier_dropdown(page) -> bool:
    """Try to expand the tiered-pricing ("阶梯计费") select dropdown on the page.

    Strategy: first a native Playwright click on a select whose text looks
    like "输入…<N>k"; on failure, fall back to a JS-dispatched click inside
    the page. Returns True when a click was performed, False otherwise.
    Never raises — all failures are swallowed and reported as False.
    """
    try:
        # First try a native Playwright click, targeting the select that
        # contains "输入" plus a k-range.
        try:
            # Precise targeting: a select selector whose text contains "输入" and "k".
            selector = page.locator(".efm_ant-select-selector, .ant-select-selector").filter(has_text=re.compile(r"输入.*\d+\s*[kK]"))
            if selector.count() > 0:
                selector.first.click(timeout=3000)
                time.sleep(0.5)
                print("[DEBUG] 原生点击成功")
                return True
        except Exception as e:
            print(f"[DEBUG] 原生点击失败: {e}")
        # Fall back to a JS click dispatched inside the page context.
        ok = page.evaluate(
            """
            () => {
                const isVisible = (el) => {
                    if (!el) return false;
                    const rect = el.getBoundingClientRect();
                    const style = window.getComputedStyle(el);
                    return rect.width > 0 && rect.height > 0 && style.display !== 'none' && style.visibility !== 'hidden';
                };
                const norm = (s) => (s || '').replace(/\s+/g, ' ').trim();
                const tierRe = /输入.*\d+\s*[kK]/i;
                // 优先找 selector 节点,文本匹配"输入<=32k"之类
                let clickEl = null;
                const selectors = Array.from(document.querySelectorAll(
                    ".efm_ant-select-selector, .ant-select-selector"
                ));
                for (const el of selectors) {
                    const txt = norm(el.innerText || el.textContent);
                    if (tierRe.test(txt) && isVisible(el)) {
                        clickEl = el;
                        break;
                    }
                }
                if (!clickEl) {
                    // 回退:找包含"输入"的 select 容器
                    const containers = Array.from(document.querySelectorAll(
                        ".efm_ant-select, .ant-select"
                    ));
                    for (const el of containers) {
                        const txt = norm(el.innerText || el.textContent);
                        if (tierRe.test(txt) && isVisible(el)) {
                            clickEl = el.querySelector(".efm_ant-select-selector, .ant-select-selector") || el;
                            break;
                        }
                    }
                }
                if (!isVisible(clickEl)) return false;
                clickEl.dispatchEvent(new MouseEvent('mousedown', { bubbles: true }));
                clickEl.dispatchEvent(new MouseEvent('mouseup', { bubbles: true }));
                clickEl.click();
                return true;
            }
            """
        )
        time.sleep(0.5)
        return bool(ok)
    except Exception:
        return False
  114. def _normalize_tier_option(opt: str) -> str:
  115. """从下拉原始文本中提取并规范化阶梯键,优先返回 input/output 范围键。
  116. 例如:"输入<=32k" -> "input<=32k";"32k<输入<=128k" -> "32k<input<=128k"。
  117. 如果无法识别,返回去掉空白的短文本。
  118. """
  119. if not opt:
  120. return "unknown"
  121. s = opt.replace('\u00a0', ' ')
  122. # 常见模式
  123. m = re.search(r"(\d+\s*k\s*<\s*输入\s*<=\s*\d+\s*k)", s, re.I)
  124. if not m:
  125. m = re.search(r"(输入\s*<=\s*\d+\s*k)", s, re.I)
  126. if not m:
  127. m = re.search(r"(\d+\s*k\s*<\s*输入)", s, re.I)
  128. if m:
  129. key = m.group(1)
  130. key = re.sub(r"\s+", "", key)
  131. key = key.replace("输入", "input").replace("输出", "output")
  132. return key
  133. # 退化策略:找包含输入或输出的数字范围
  134. if "输入" in s or "输出" in s:
  135. nums = re.findall(r"\d+\s*k", s, re.I)
  136. if nums:
  137. joined = "-".join([n.replace(' ', '') for n in nums])
  138. if "输入" in s:
  139. return f"input_{joined}"
  140. return f"output_{joined}"
  141. # 最后回退到简短、安全的键
  142. short = re.sub(r"\s+", " ", s).strip()
  143. return short[:60]
def _get_tier_options(page) -> List[str]:
    """Open the tier dropdown and return the de-duplicated list of tier option
    texts (strings containing "输入" plus a k-range).

    Two extraction strategies: (1) read visible leaf texts inside the expanded
    ant-design dropdown; (2) loose fallback scanning all visible leaf nodes on
    the page. Emits [DEBUG]/[CONTAINER] diagnostics to stdout and presses
    Escape at the end to close the dropdown. Returns [] when the dropdown
    cannot be opened or no options are found.
    """
    if not _open_tier_dropdown(page):
        print("[DEBUG] 未找到可点击的阶梯计费触发器")
        return []
    print("[DEBUG] 已展开阶梯计费下拉")
    # Diagnostic: dump classes of visible containers to help locate the dropdown portal.
    try:
        containers = page.evaluate(
            """
            () => {
                const isVisible = (el) => {
                    const r = el.getBoundingClientRect();
                    const s = window.getComputedStyle(el);
                    return r.width > 0 && r.height > 0 && s.display !== 'none' && s.visibility !== 'hidden';
                };
                return Array.from(document.querySelectorAll('div,ul'))
                    .filter(el => isVisible(el))
                    .map(el => ({ cls: el.className, childCount: el.children.length,
                                  text: (el.innerText||'').replace(/\\s+/g,' ').trim().slice(0,80) }))
                    .filter(x => /select|dropdown|popup|overlay|option|list|menu/i.test(x.cls));
            }
            """
        )
        for c in containers:
            print(f"[CONTAINER] cls={c['cls']!r:.80} children={c['childCount']} text={c['text']!r:.60}")
    except Exception as e:
        print(f"[DEBUG] 容器诊断失败: {e}")
    # Wait for the dropdown container to appear (widened selector set).
    dropdown_sel = (
        ".efm_ant-select-dropdown, .ant-select-dropdown, "
        "[class*='dropdown'], [class*='popup'], [class*='select-list']"
    )
    try:
        page.wait_for_selector(dropdown_sel, state="visible", timeout=3000)
        print("[DEBUG] 下拉容器已出现")
    except Exception:
        print("[DEBUG] 下拉容器未出现,尝试继续")
    options = []
    # Strategy 1: take option texts from inside the dropdown container.
    try:
        options = page.evaluate(
            """
            () => {
                const isVisible = (el) => {
                    const r = el.getBoundingClientRect();
                    const s = window.getComputedStyle(el);
                    return r.width > 0 && r.height > 0 && s.display !== 'none' && s.visibility !== 'hidden';
                };
                // 找到展开的下拉容器
                const dropdown = Array.from(document.querySelectorAll(
                    '.efm_ant-select-dropdown, .ant-select-dropdown'
                )).find(el => isVisible(el));
                if (!dropdown) return [];
                // 取容器内所有叶子节点文本
                const leaves = Array.from(dropdown.querySelectorAll('*'))
                    .filter(el => isVisible(el) && el.children.length === 0);
                const texts = leaves
                    .map(el => (el.innerText || el.textContent || '').replace(/\\s+/g, ' ').trim())
                    .filter(t => t.length > 0 && t.length < 60);
                return Array.from(new Set(texts));
            }
            """
        )
        print(f"[DEBUG] 下拉容器内文本: {options}")
        # Keep only tier options (containing 输入 + a k-range).
        options = [t for t in options if re.search(r"输入", t) and re.search(r"\d+\s*[kK]", t)]
    except Exception as e:
        print(f"[DEBUG] 下拉容器提取失败: {e}")
        options = []
    # Strategy 2: loose fallback — scan all visible leaf nodes page-wide.
    if not options:
        print("[DEBUG] 下拉容器未找到,尝试宽松兜底")
        try:
            options = page.evaluate(
                """
                () => {
                    const isVisible = (el) => {
                        const r = el.getBoundingClientRect();
                        const s = window.getComputedStyle(el);
                        return r.width > 0 && r.height > 0 && s.display !== 'none' && s.visibility !== 'hidden';
                    };
                    const texts = Array.from(document.querySelectorAll('*'))
                        .filter(el => isVisible(el) && el.children.length === 0)
                        .map(el => (el.innerText || el.textContent || '').replace(/\\s+/g, ' ').trim())
                        .filter(t => t.length < 60 && /输入/.test(t) && /\\d+\\s*[kK]/.test(t) && /<=|</.test(t));
                    return Array.from(new Set(texts));
                }
                """
            )
            print(f"[DEBUG] 宽松兜底找到: {options}")
        except Exception as e:
            print(f"[DEBUG] 宽松兜底失败: {e}")
            options = []
    print(f"[DEBUG] 找到的档位选项: {options}")
    # Close the dropdown so later interactions start from a clean state.
    try:
        page.keyboard.press("Escape")
    except Exception:
        pass
    # Preserve order while de-duplicating.
    return list(dict.fromkeys(options))
def _select_tier_option(page, option_text: str) -> bool:
    """Re-open the tier dropdown and click the option whose text equals
    ``option_text`` exactly.

    Tries a native Playwright click first, then falls back to dispatching a JS
    click on the matching ant-design option node. Returns True on success,
    False on any failure (dropdown did not open/appear, option not found).
    """
    # Re-expand the dropdown before each selection.
    if not _open_tier_dropdown(page):
        print(f"[DEBUG] 选择 {option_text} 失败: 未能展开下拉")
        return False
    # Wait for the dropdown to appear.
    try:
        page.wait_for_selector(
            ".efm_ant-select-dropdown, .ant-select-dropdown",
            state="visible",
            timeout=2000,
        )
    except Exception:
        print(f"[DEBUG] 选择 {option_text} 失败: 下拉未出现")
        return False
    try:
        print(f"[DEBUG] 尝试选择档位: {option_text}")
        # Prefer a native click.
        try:
            option_loc = page.get_by_text(option_text, exact=True).first
            option_loc.click(timeout=3000, force=False)
            time.sleep(0.6)
            print(f"[DEBUG] 成功选择档位: {option_text}")
            return True
        except Exception as e:
            print(f"[DEBUG] 原生点击失败: {e},尝试 JS 点击")
        # Fall back to a JS click inside the page context.
        clicked = page.evaluate(
            """
            (opt) => {
                const isVisible = (el) => {
                    if (!el) return false;
                    const rect = el.getBoundingClientRect();
                    const style = window.getComputedStyle(el);
                    return rect.width > 0 && rect.height > 0 && style.display !== 'none' && style.visibility !== 'hidden';
                };
                const norm = (s) => (s || '').replace(/\s+/g, ' ').trim();
                const nodes = Array.from(document.querySelectorAll(
                    ".efm_ant-select-item-option-content, [role='option'], .efm_ant-select-item, .ant-select-item"
                ));
                const target = nodes.find((n) => norm(n.textContent) === opt && isVisible(n));
                if (!target) return false;
                const clickEl = target.closest(".efm_ant-select-item, [role='option']") || target;
                clickEl.dispatchEvent(new MouseEvent('mousedown', { bubbles: true }));
                clickEl.dispatchEvent(new MouseEvent('mouseup', { bubbles: true }));
                clickEl.click();
                return true;
            }
            """,
            option_text,
        )
        if clicked:
            time.sleep(0.6)
            print(f"[DEBUG] 成功选择档位: {option_text}")
            return True
        else:
            print(f"[DEBUG] JS 点击也失败")
            return False
    except Exception as e:
        print(f"[DEBUG] 选择档位 {option_text} 失败: {e}")
        return False
def _ensure_tiered_pricing(page) -> None:
    """Best-effort click on a "阶梯计费" (tiered pricing) toggle if one exists
    on the page; all failures are silently ignored.
    """
    try:
        toggle = page.locator("text=阶梯计费").first
        if toggle.count() > 0:
            toggle.click()
            time.sleep(0.3)
    except Exception:
        pass
def parse_prices_from_text(text: str) -> List[Dict]:
    """Extract price items (label + price) from the block text that contains
    the "模型价格" (model price) section.

    Each returned dict has at least {"label", "raw", "currency"} plus either
    "price" (single value) or "price_current"/"price_original" (two values on
    one line), and optionally "unit"/"note". Lines reading "原价" (original
    price) are merged into the preceding item instead of producing a new one.
    """
    # Normalize line endings and split into non-empty, stripped lines.
    lines = [ln.strip() for ln in text.splitlines()]
    lines = [ln for ln in lines if ln]
    items = []
    # Walk the lines; pair each line containing "<number> 元" with a label,
    # supporting multiple prices per line (current/original) and tier labels.
    price_re = re.compile(r"([0-9]+(?:\.[0-9]+)?)\s*元", re.I)
    for idx, ln in enumerate(lines):
        matches = price_re.findall(ln)
        if not matches:
            continue
        # Label: prefer text preceding the first price on the same line,
        # otherwise walk backwards to the previous non-price line.
        label = None
        first_m = price_re.search(ln)
        if first_m:
            before = ln[: first_m.start()].strip()
            if before:
                label = before
        if not label:
            for j in range(idx - 1, -1, -1):
                if lines[j] and not price_re.search(lines[j]):
                    label = lines[j]
                    break
        if not label:
            label = f"price_{len(items) + 1}"
        # "原价" lines: attach the value to the previous item as its
        # original (pre-discount) price rather than creating a new entry.
        if label == "原价":
            if items and matches:
                try:
                    items[-1]["price_original"] = float(matches[0])
                except Exception:
                    items[-1]["price_original"] = matches[0]
                items[-1].setdefault("note", "")
                if items[-1]["note"]:
                    items[-1]["note"] += "; 原价显示"
                else:
                    items[-1]["note"] = "原价显示"
            continue
        raw = ln
        # For tiered-pricing rows whose label is just "输入"/"输出", prefer
        # the nearby range label (e.g. "输入<=32k") when one can be found.
        if re.fullmatch(r"输入|输出", label.strip()):
            tier_label = _find_nearest_tier_label(lines, idx)
            if tier_label:
                label = tier_label
        entry: Dict = {"label": label.strip(), "raw": raw}
        try:
            nums = [float(x) for x in matches]
            if len(nums) == 1:
                entry["price"] = nums[0]
            else:
                # Heuristic: the smaller value is the current (discounted)
                # price, the larger one the original price.
                fnums = sorted(nums)
                entry["price_current"] = fnums[0]
                entry["price_original"] = fnums[-1]
        except Exception:
            # Fallback: keep the first match as the price (string if needed).
            try:
                entry["price"] = float(matches[0])
            except Exception:
                entry["price"] = matches[0]
        # Detect unit and promotion markers.
        unit = None
        if re.search(r"每千|每 1k|/千|/每千|tokens", raw, re.I):
            unit = "元/每千tokens"
        # A literal "元/每…" phrase in the line overrides the generic unit.
        unit_m = re.search(r"元\s*/?\s*每[^\n,,;]*", raw)
        if unit_m:
            unit = unit_m.group(0)
        if unit:
            entry["unit"] = unit
        note = []
        if re.search(r"限时|折", raw):
            note.append("限时优惠")
        if re.search(r"原价", raw):
            note.append("原价显示")
        if note:
            entry["note"] = "; ".join(note)
        entry["currency"] = "CNY"
        items.append(entry)
    return items
  395. def extract_price_block_html(html: str) -> str:
  396. """定位包含"模型价格"标题的节点并返回其较大容器的文本(回退为整页文本)。
  397. 如果系统未安装 lxml,回退到内置的 html.parser。
  398. """
  399. try:
  400. soup = BeautifulSoup(html, "lxml")
  401. except FeatureNotFound:
  402. soup = BeautifulSoup(html, "html.parser")
  403. node = soup.find(string=re.compile(r"模型价格"))
  404. if not node:
  405. return soup.get_text(separator="\n")
  406. ancestor = node.parent
  407. for _ in range(6):
  408. txt = ancestor.get_text(separator="\n")
  409. if "元" in txt or re.search(r"\d", txt) or "tokens" in txt.lower():
  410. return txt
  411. if ancestor.parent:
  412. ancestor = ancestor.parent
  413. else:
  414. break
  415. return ancestor.get_text(separator="\n")
def extract_price_items_from_html(html: str) -> List[Dict]:
    """Structured extraction of price items from rendered HTML. Returns items
    shaped like [{label, price / price_current & price_original, currency,
    unit, note, raw}], plus grouped entries {"label": "input"/"output",
    "tiers": {...}} for tiered pricing. Heuristic rules adapt to tables, rows
    and div-based layouts. Returns [] when no "模型价格" anchor is found.
    """
    try:
        soup = BeautifulSoup(html, "lxml")
    except FeatureNotFound:
        soup = BeautifulSoup(html, "html.parser")
    node = soup.find(string=re.compile(r"模型价格"))
    if not node:
        return []
    # Climb up to 6 levels until the container's text looks price-bearing.
    ancestor = node.parent
    container = ancestor
    for _ in range(6):
        txt = ancestor.get_text(separator="\n")
        if "元" in txt or re.search(r"\d", txt) or "tokens" in txt.lower():
            container = ancestor
            break
        if ancestor.parent:
            ancestor = ancestor.parent
        else:
            container = ancestor
            break
    price_re = re.compile(r"([0-9]+(?:\.[0-9]+)?)\s*元", re.I)
    items: List[Dict] = []
    # Prefer line-by-line text parsing of the container — it captures tiered
    # rows such as "输入<=32k: 0.0025 元" better than structural parsing.
    container_text = container.get_text(separator="\n")
    items = parse_prices_from_text(container_text)

    def _postprocess_items(raw_items: List[Dict]) -> List[Dict]:
        """Filter tool-call prices, merge 原价 rows into the previous item,
        extract promo notes/units, and clean labels. Mutates items in place
        and returns the filtered list."""
        filtered: List[Dict] = []
        for it in raw_items:
            raw = it.get("raw", "")
            label = it.get("label", "")
            unit = it.get("unit", "")
            tier = it.get("tier", "")
            # Drop tool/API-call prices entirely.
            if _is_tool_call_item(label, raw, unit):
                continue
            # 原价 (original price) rows: merge into the previous entry.
            if "原价" in label and filtered:
                if "price" in it:
                    filtered[-1]["price_original"] = it["price"]
                elif "price_current" in it and "price_original" in it:
                    filtered[-1]["price_original"] = it["price_original"]
                filtered[-1].setdefault("note", "")
                if filtered[-1]["note"]:
                    filtered[-1]["note"] += "; 原价显示"
                else:
                    filtered[-1]["note"] = "原价显示"
                continue
            # Extract promotion info (time-limited, discount) into note.
            notes = []
            discount_match = re.search(r"(限时)?([0-9.]+)\s*折", raw)
            if discount_match:
                discount = discount_match.group(2)
                notes.append(f"限时{discount}折")
            else:
                if re.search(r"限时|免费", raw) or re.search(r"限时|免费", label):
                    if re.search(r"免费", raw):
                        notes.append("限时免费")
                    else:
                        notes.append("限时优惠")
            if re.search(r"原价", raw):
                notes.append("原价显示")
            if notes:
                it["note"] = "; ".join(notes)
            # Unit detection (only when not already set).
            if "unit" not in it:
                if re.search(r"每千|tokens|/千|/每千", raw, re.I):
                    it["unit"] = "元/每千tokens"
                else:
                    um = re.search(r"元\s*/?\s*每[^\n,,;]*", raw)
                    if um:
                        it["unit"] = um.group(0)
            # Clean the label: strip promo markers, discounts, units — keep
            # only the base label, optionally prefixed by the tier.
            cleaned_label = re.sub(r"限时[0-9.]*折|限时|免费|原价|\s*元.*", "", label).strip()
            cleaned_label = re.sub(r"\s+", " ", cleaned_label).strip()
            if not cleaned_label:
                cleaned_label = "price"
            if tier:
                cleaned_label = f"{tier} {cleaned_label}".strip()
            it["label"] = cleaned_label
            # Unify the currency.
            it["currency"] = "CNY"
            filtered.append(it)
        return filtered

    filtered = _postprocess_items(items)
    # Restructure tiered pricing into per-base groups:
    # {input: {tier_key: {price, ...}}, output: {...}}.
    structured: List[Dict] = []
    grouped: Dict[str, Dict[str, Dict]] = {}
    for it in filtered:
        lbl = it.get("label", "")
        raw = it.get("raw", "")
        combined = lbl + " " + raw  # NOTE(review): currently unused.
        # Group whenever the label mentions 输入/输出 (input/output).
        should_group = False
        group = None
        if re.search(r"输入", lbl):
            should_group = True
            group = "input"
        elif re.search(r"输出", lbl):
            should_group = True
            group = "output"
        # Items produced by tier switching (carrying a "tier" field) are
        # grouped by that tier first.
        if "tier" in it:
            tier_raw = it.get("tier") or ""
            tier_key = _normalize_tier_option(tier_raw)
            # If the label is not clearly 输入/输出, infer the group from the
            # normalized tier key; default to "input".
            if not group:
                if "input" in tier_key.lower():
                    group = "input"
                elif "output" in tier_key.lower():
                    group = "output"
                else:
                    group = "input"
            tier_data = {k: v for k, v in it.items() if k not in ("label", "tier")}
            grouped.setdefault(group, {})[tier_key] = tier_data
        elif should_group and group:
            # Fallback: use the label itself as the tier key.
            key = lbl
            if group == "input":
                key = re.sub(r"^输入", "input", key)
            elif group == "output":
                key = re.sub(r"^输出", "output", key)
            tier_data = {k: v for k, v in it.items() if k not in ("label",)}
            grouped.setdefault(group, {})[key] = tier_data
        else:
            structured.append(it)
    # Emit each group as a single item with a "tiers" mapping.
    for g, mapping in grouped.items():
        structured.append({"label": g, "tiers": mapping})
    items = structured
    # If nothing parsed, try a class-name-based fallback (handles encoding
    # issues or units rendered via pseudo-elements).
    if not items:
        try:
            price_nodes = []
            # Elements whose class contains "price" hold the price values.
            for el in soup.find_all(class_=re.compile(r"price", re.I)):
                text = el.get_text(" ", strip=True)
                # Skip non-numeric text.
                if not re.search(r"[0-9]+(\.[0-9]+)?", text):
                    continue
                price_nodes.append((el, text))
            seen = set()
            for el, text in price_nodes:
                if text in seen:
                    continue
                seen.add(text)
                # Look for a nearby unit element.
                unit_el = el.find_next(class_=re.compile(r"unit", re.I))
                unit_text = unit_el.get_text(" ", strip=True) if unit_el else None
                # Look for a label: climb parents searching for a *label* class.
                label = None
                p = el
                for _ in range(4):
                    sib_label = None
                    parent = p.parent
                    if parent:
                        sib_label = parent.find(class_=re.compile(r"label", re.I))
                    if sib_label and sib_label.get_text(strip=True):
                        label = sib_label.get_text(" ", strip=True)
                        break
                    if parent is None:
                        break
                    p = parent
                if not label:
                    # Try the preceding sibling text nodes (up to 6 steps).
                    prev = el.previous_sibling
                    steps = 0
                    while prev and steps < 6:
                        candidate = None
                        if isinstance(prev, str) and prev.strip():
                            candidate = prev.strip()
                        else:
                            try:
                                candidate = prev.get_text(" ", strip=True)
                            except Exception:
                                candidate = None
                        if candidate and not re.search(r"[0-9]", candidate):
                            label = candidate
                            break
                        prev = prev.previous_sibling
                        steps += 1
                entry = {"label": label or "price", "raw": text, "currency": "CNY"}
                try:
                    entry["price"] = float(re.search(r"([0-9]+(?:\.[0-9]+)?)", text).group(1))
                except Exception:
                    entry["price"] = text
                if unit_text:
                    entry["unit"] = unit_text
                items.append(entry)
        except Exception:
            pass
        if items:
            items = _postprocess_items(items)
    return items
  616. def extract_price_items_global(html: str) -> List[Dict]:
  617. """在整个 HTML 中全局搜索价格字符串,尝试提取周围上下文作为 label。
  618. 这是最后的回退解析,适用于页面结构复杂或文本没包含"模型价格"定位词时。
  619. """
  620. try:
  621. soup = BeautifulSoup(html, "lxml")
  622. except FeatureNotFound:
  623. soup = BeautifulSoup(html, "html.parser")
  624. # 全局回退:仍然优先查找靠近"模型价格"标题的文本
  625. node = soup.find(string=re.compile(r"模型价格"))
  626. if not node:
  627. # 若页面没有"模型价格"关键词,则不尝试全页解析(避免抓到工具调用价格)
  628. return []
  629. ancestor = node.parent
  630. for _ in range(6):
  631. txt = ancestor.get_text(separator="\n")
  632. if "元" in txt or re.search(r"\d", txt) or "tokens" in txt.lower():
  633. return parse_prices_from_text(txt)
  634. if ancestor.parent:
  635. ancestor = ancestor.parent
  636. else:
  637. break
  638. return parse_prices_from_text(ancestor.get_text(separator="\n"))
def scrape_model_price(url: str, headless: bool = True, timeout: int = 20000, executable_path: Optional[str] = None) -> Dict:
    """Open the page with Playwright, wait for rendering, then extract price
    information.

    Returns {"url", "error", "prices"} where "prices" maps labels/tier keys to
    price dicts. ``timeout`` is the navigation timeout in milliseconds;
    ``executable_path`` optionally overrides the Chromium binary.
    """
    result = {"url": url, "error": None, "items": []}
    with sync_playwright() as p:
        launch_kwargs = {"headless": headless}
        if executable_path:
            launch_kwargs["executable_path"] = executable_path
        browser = p.chromium.launch(**launch_kwargs)
        context = browser.new_context()
        page = context.new_page()
        # Debug collection: network responses and console messages.
        network_hits = []
        console_logs = []

        def _on_console(msg):
            # Record console messages; ignore any collection failure.
            try:
                console_logs.append({"type": msg.type, "text": msg.text})
            except Exception:
                pass

        def _on_response(resp):
            # Capture snippets of text/JSON responses that look price-related.
            try:
                url_r = resp.url
                ct = resp.headers.get("content-type", "")
                # Only read text/JSON responses to avoid binary bodies.
                if "application/json" in ct or ct.startswith("text") or "json" in url_r.lower() or "price" in url_r.lower():
                    try:
                        body = resp.text()
                    except Exception:
                        body = None
                    snippet = None
                    if body:
                        if "元" in body or "price" in body.lower() or "tokens" in body.lower() or "price" in url_r.lower():
                            snippet = body[:2000]
                    if snippet:
                        network_hits.append({"url": url_r, "content_type": ct, "snippet": snippet})
            except Exception:
                pass

        page.on("console", _on_console)
        page.on("response", _on_response)
        try:
            page.goto(url, wait_until="networkidle", timeout=timeout)
        except PlaywrightTimeoutError:
            # networkidle can be hard to reach — retry with "load".
            try:
                page.goto(url, wait_until="load", timeout=timeout)
            except Exception as e:
                result["error"] = f"导航失败: {e}"
                browser.close()
                return result
        # Wait (up to 8s) for the "模型价格" text to appear on the page.
        try:
            page.wait_for_selector("text=模型价格", timeout=8000)
        except PlaywrightTimeoutError:
            # The page may lack the exact text; still try to scrape.
            pass
        # Short pause so async rendering can settle.
        time.sleep(1.2)
        html = page.content()
        # Prefer structured HTML parsing; text parsing is the fallback.
        items = []
        try:
            items = extract_price_items_from_html(html)
        except Exception:
            items = []
        # Try to expand the tiered-pricing dropdown and scrape tier by tier.
        tiered_items: List[Dict] = []
        try:
            _ensure_tiered_pricing(page)
            tier_options = _get_tier_options(page)
            print(f"[DEBUG] 总共找到 {len(tier_options)} 个档位")
            for opt in tier_options:
                if not _select_tier_option(page, opt):
                    continue
                html = page.content()
                try:
                    tier_items = extract_price_items_from_html(html)
                    print(f"[DEBUG] 档位 {opt} 解析出 {len(tier_items)} 条价格")
                except Exception as e:
                    print(f"[DEBUG] 档位 {opt} 解析失败: {e}")
                    tier_items = []
                # Tag each item with the tier it was scraped under.
                for it in tier_items:
                    it["tier"] = opt
                tiered_items.extend(tier_items)
        except Exception as e:
            print(f"[DEBUG] 阶梯计费抓取异常: {e}")
            tiered_items = []
        print(f"[DEBUG] 总共收集 {len(tiered_items)} 条有档位标记的价格")
        if tiered_items:
            print("[DEBUG] 使用阶梯计费结果,替换普通结果")
            items = tiered_items
        # Nothing parsed yet: wait for any "<number> 元" text, scroll to
        # trigger lazy loading, and retry.
        if not items:
            try:
                # Without an explicit "模型价格" heading, wait for any "元" price text.
                page.wait_for_selector("text=/[0-9]+(\\.[0-9]+)?\\s*元/", timeout=8000)
            except PlaywrightTimeoutError:
                pass
            # Retry the parse (lazy-loaded content may need scrolling).
            try:
                # Scroll to the bottom and wait for rendering.
                page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                time.sleep(1.0)
                html = page.content()
                items = extract_price_items_from_html(html)
            except Exception:
                items = []
        # Final fallback: plain-text parsing of the price block.
        if not items:
            text_block = extract_price_block_html(html)
            if not text_block:
                result["error"] = "未找到包含 '模型价格' 的区域,可能需要登录或页面结构不同。"
                browser.close()
                return result
            items = parse_prices_from_text(text_block)

        # Convert parsed items into a compact model-price-only structure.
        def _build_price_map(parsed_items: List[Dict]) -> Dict:
            """Build {key: {...}} from items, handling both grouped entries
            (with a 'tiers' dict) and per-item 'tier' fields, as well as plain
            label-keyed entries (merged into lists on key collisions)."""
            price_map: Dict = {}
            # Two shapes to handle: 1) items containing 'tiers' (already
            # grouped into input/output); 2) items carrying a 'tier' field
            # (from per-tier scraping) or plain labels.
            for it in parsed_items:
                # Already-grouped structure (label=input/output, tiers=dict).
                if isinstance(it, dict) and it.get("tiers") and isinstance(it.get("tiers"), dict):
                    for tier_key, tier_val in it["tiers"].items():
                        k = _normalize_tier_option(tier_key)
                        # Keep a dict per tier so one tier can hold several sub-entries.
                        price_map.setdefault(k, {})
                        # File the value under its original label or raw text.
                        sub_label = tier_val.get("label") or tier_val.get("raw") or k
                        price_map[k][sub_label] = {k2: v for k2, v in tier_val.items() if k2 not in ("tier", "tiers", "label")}
                    continue
                # Item carrying its own tier field.
                if it.get("tier"):
                    tk = _normalize_tier_option(it.get("tier"))
                    price_map.setdefault(tk, {})
                    sub_label = it.get("label") or it.get("raw") or tk
                    price_map[tk][sub_label] = {k: v for k, v in it.items() if k not in ("tier", "label")}
                    continue
                # Plain (non-tiered) entry: key directly by its label.
                lbl = it.get("label") or it.get("raw") or "price"
                # On a key collision, merge entries into a list.
                if lbl in price_map and not isinstance(price_map[lbl], list):
                    price_map[lbl] = [price_map[lbl]]
                if isinstance(price_map.get(lbl), list):
                    price_map[lbl].append({k: v for k, v in it.items() if k != "label"})
                else:
                    price_map[lbl] = {k: v for k, v in it.items() if k != "label"}
            return price_map

        price_map = _build_price_map(items)
        result = {"url": url, "error": result.get("error"), "prices": price_map}
        browser.close()
        return result
  789. def main():
  790. ap = argparse.ArgumentParser(description="爬取阿里云模型市场页面的模型价格(基于 Playwright)")
  791. group = ap.add_mutually_exclusive_group(required=True)
  792. group.add_argument("--url", help="单个模型页面 URL")
  793. group.add_argument("--file", help="包含多个 URL(每行一个)的文件路径")
  794. ap.add_argument("--headful", action="store_true", help="以有头模式打开浏览器(方便调试)")
  795. ap.add_argument("--timeout", type=int, default=20000, help="导航超时(毫秒),默认20000")
  796. ap.add_argument("--browser-path", help="浏览器可执行文件完整路径(覆盖环境变量 PLAYWRIGHT_EXECUTABLE)")
  797. args = ap.parse_args()
  798. urls: List[str] = []
  799. if args.url:
  800. urls = [args.url]
  801. else:
  802. with open(args.file, "r", encoding="utf-8") as f:
  803. urls = [ln.strip() for ln in f if ln.strip()]
  804. results = []
  805. # 优先使用命令行传入的浏览器可执行路径,其次检查环境变量 PLAYWRIGHT_EXECUTABLE
  806. exec_path = None
  807. if args.browser_path:
  808. exec_path = args.browser_path
  809. else:
  810. exec_path = os.environ.get("PLAYWRIGHT_EXECUTABLE")
  811. # 环境变量 PLAYWRIGHT_HEADLESS=false 可强制有头模式
  812. headless = not args.headful
  813. if os.environ.get("PLAYWRIGHT_HEADLESS", "").lower() == "false":
  814. headless = False
  815. for u in urls:
  816. print(f"抓取: {u}")
  817. res = scrape_model_price(u, headless=headless, timeout=args.timeout, executable_path=exec_path)
  818. results.append(res)
  819. print(json.dumps(results, ensure_ascii=False, indent=2))
# Script entry point.
if __name__ == "__main__":
    main()