#!/usr/bin/env python3
"""
scrape_tool_prices.py

Scrape tool-call prices from Alibaba Cloud Bailian model pages:
- per-call fees for tools such as search strategies, the code interpreter
  and text-to-image
- the unit is usually CNY per thousand calls (元/千次调用)

How it works: reuses the page-rendering approach of scrape_aliyun_models.py,
but extracts only the tool-call price rows (which that script filters out).
"""
import json
import re
import time
from typing import Dict, List, Optional

from playwright.sync_api import TimeoutError as PlaywrightTimeoutError
  15. # 工具调用价格识别规则
  16. TOOL_CALL_RE = re.compile(
  17. r"搜索策略|代码解释|文生图|数据增强|模型推理|工具调用|千次调用|/千次|次调用",
  18. re.I,
  19. )
  20. # 单位识别
  21. TOOL_UNIT_RE = re.compile(r"千次调用|/千次|次调用", re.I)
  22. def _is_tool_call_item(label: str, raw: str) -> bool:
  23. return bool(TOOL_CALL_RE.search(label) or TOOL_CALL_RE.search(raw))
  24. def parse_tool_prices_from_text(text: str) -> List[Dict]:
  25. """
  26. 从"工具调用价格"区块文本中提取工具调用价格条目。
  27. 文本是一整行,格式:
  28. 工具名Completions API价格信息工具名Responses API价格信息...
  29. """
  30. items: List[Dict] = []
  31. seen: set = set()
  32. price_re = re.compile(r"([0-9]+(?:\.[0-9]+)?)")
  33. free_re = re.compile(r"限时免费|免费")
  34. # 用 API 类型作为分隔符,切成 [工具名+价格, API类型, 工具名+价格, API类型, ...]
  35. api_sep_re = re.compile(r"(Completions API|Responses API)")
  36. parts = api_sep_re.split(text)
  37. # parts 结构: ["工具调用价格tool1", "Completions API", "价格1tool2", "Responses API", "价格2tool3", ...]
  38. # 每个条目 = parts[n](工具名在末尾) + parts[n+1](API类型,丢弃) + parts[n+2](价格在开头)
  39. # 工具名在前一段的末尾,价格在后一段的开头
  40. tool_re = re.compile(r"([a-zA-Z][a-zA-Z0-9_:]*)$") # 段末尾的工具名
  41. for i in range(0, len(parts) - 1, 2):
  42. before = parts[i] # 包含工具名(在末尾)
  43. # parts[i+1] 是 API 类型,跳过
  44. after = parts[i + 2] if i + 2 < len(parts) else "" # 包含价格(在开头)
  45. # 从 before 末尾提取工具名
  46. m = tool_re.search(before)
  47. if not m:
  48. continue
  49. label = m.group(1)
  50. if label in seen:
  51. continue
  52. # 从 after 开头提取价格信息(到下一个工具名开始前)
  53. next_tool_m = tool_re.search(after)
  54. price_info = after[: next_tool_m.start()].strip() if next_tool_m else after.strip()
  55. entry: Dict = {"label": label, "currency": "CNY", "unit": "元/千次调用"}
  56. if free_re.search(price_info):
  57. entry["price"] = 0
  58. entry["note"] = "限时免费"
  59. else:
  60. nums = price_re.findall(price_info)
  61. if not nums:
  62. continue
  63. try:
  64. entry["price"] = float(nums[0])
  65. except Exception:
  66. entry["price"] = nums[0]
  67. if re.search(r"限时优惠", price_info):
  68. entry["note"] = "限时优惠"
  69. dm = re.search(r"([0-9.]+)\s*折", price_info)
  70. if dm:
  71. entry["note"] = f"限时{dm.group(1)}折"
  72. seen.add(label)
  73. items.append(entry)
  74. return items
  75. price_re = re.compile(r"([0-9]+(?:\.[0-9]+)?)")
  76. free_re = re.compile(r"限时免费|免费")
  77. for m in pattern.finditer(text):
  78. label = m.group(1).strip()
  79. price_info = m.group(2).strip()
  80. if not label or label in seen:
  81. continue
  82. entry: Dict = {"label": label, "currency": "CNY", "unit": "元/千次调用"}
  83. if free_re.search(price_info):
  84. entry["price"] = 0
  85. entry["note"] = "限时免费"
  86. else:
  87. nums = price_re.findall(price_info)
  88. if not nums:
  89. continue
  90. try:
  91. entry["price"] = float(nums[0])
  92. except Exception:
  93. entry["price"] = nums[0]
  94. if re.search(r"限时优惠", price_info):
  95. entry["note"] = "限时优惠"
  96. dm = re.search(r"([0-9.]+)\s*折", price_info)
  97. if dm:
  98. entry["note"] = f"限时{dm.group(1)}折"
  99. seen.add(label)
  100. items.append(entry)
  101. return items
  102. # 用 API 类型标注作为分隔符切割整段文本
  103. api_sep_re = re.compile(r"(Completions API|Responses API)")
  104. price_re = re.compile(r"([0-9]+(?:\.[0-9]+)?)\s*元")
  105. free_re = re.compile(r"限时免费|免费")
  106. # 先去掉标题
  107. text = re.sub(r"^工具调用价格", "", text.strip())
  108. # 按 API 类型切割:得到 [工具名, API类型, 价格信息, 工具名, API类型, 价格信息, ...]
  109. parts = api_sep_re.split(text)
  110. # parts 结构:[工具名1, "Completions API", 价格1+工具名2, "Responses API", 价格2+工具名3, ...]
  111. items: List[Dict] = []
  112. seen: set = set()
  113. i = 0
  114. while i < len(parts):
  115. segment = parts[i].strip()
  116. # 跳过 API 类型标注本身
  117. if api_sep_re.fullmatch(segment):
  118. i += 1
  119. continue
  120. # 这段包含:上一条目的价格信息 + 下一条目的工具名
  121. # 需要从末尾提取工具名(工具名是纯英文+冒号/下划线,不含中文和数字价格)
  122. # 工具名模式:由字母、数字、下划线、冒号组成
  123. tool_name_re = re.compile(r"([a-zA-Z][a-zA-Z0-9_:]*(?:\.[a-zA-Z0-9_:]+)*)$")
  124. # 先提取末尾的工具名(留给下一轮用)
  125. next_tool = ""
  126. m = tool_name_re.search(segment)
  127. if m:
  128. next_tool = m.group(1)
  129. price_part = segment[: m.start()].strip()
  130. else:
  131. price_part = segment
  132. # 如果有上一个工具名等待配对价格
  133. if i > 0:
  134. # 找上一个工具名
  135. prev_tool = ""
  136. prev_seg = parts[i - 2].strip() if i >= 2 else ""
  137. tm = tool_name_re.search(prev_seg)
  138. if tm:
  139. prev_tool = tm.group(1)
  140. elif i == 1:
  141. # 第一段就是工具名
  142. prev_tool = parts[0].strip()
  143. if prev_tool and prev_tool not in seen:
  144. entry: Dict = {"label": prev_tool, "currency": "CNY", "unit": "元/千次调用"}
  145. if free_re.search(price_part) and not price_re.search(price_part):
  146. entry["price"] = 0
  147. entry["note"] = "限时免费"
  148. else:
  149. nums = price_re.findall(price_part)
  150. if nums:
  151. try:
  152. entry["price"] = float(nums[0])
  153. except Exception:
  154. entry["price"] = nums[0]
  155. if re.search(r"限时优惠", price_part):
  156. entry["note"] = "限时优惠"
  157. dm = re.search(r"([0-9.]+)\s*折", price_part)
  158. if dm:
  159. entry["note"] = f"限时{dm.group(1)}折"
  160. else:
  161. i += 1
  162. continue
  163. seen.add(prev_tool)
  164. items.append(entry)
  165. i += 1
  166. # 处理最后一个工具(最后一段没有后续 API 标注)
  167. if parts:
  168. last_seg = parts[-1].strip()
  169. # 如果最后一段不是 API 类型,且含价格或免费信息
  170. if not api_sep_re.fullmatch(last_seg):
  171. # 找最后一个工具名(倒数第二个 API 标注之后的工具名)
  172. # 已在循环中处理,这里处理最后一段的价格+工具名情况
  173. tool_name_re = re.compile(r"([a-zA-Z][a-zA-Z0-9_:]*(?:\.[a-zA-Z0-9_:]+)*)$")
  174. m = tool_name_re.search(last_seg)
  175. if m:
  176. last_tool = m.group(1)
  177. last_price_part = last_seg[: m.start()].strip()
  178. if last_tool not in seen and (free_re.search(last_price_part) or price_re.search(last_price_part)):
  179. entry = {"label": last_tool, "currency": "CNY", "unit": "元/千次调用"}
  180. if free_re.search(last_price_part) and not price_re.search(last_price_part):
  181. entry["price"] = 0
  182. entry["note"] = "限时免费"
  183. else:
  184. nums = price_re.findall(last_price_part)
  185. if nums:
  186. try:
  187. entry["price"] = float(nums[0])
  188. except Exception:
  189. entry["price"] = nums[0]
  190. if re.search(r"限时优惠", last_price_part):
  191. entry["note"] = "限时优惠"
  192. seen.add(last_tool)
  193. items.append(entry)
  194. return items
  195. def _get_tool_price_section_text(html: str) -> str:
  196. """
  197. 专门定位"工具调用价格"区块文本,排除 script/style。
  198. 工具调用价格是独立区块,标题为"工具调用价格",不在"模型价格"区块内。
  199. """
  200. try:
  201. from bs4 import BeautifulSoup, FeatureNotFound
  202. try:
  203. soup = BeautifulSoup(html, "lxml")
  204. except FeatureNotFound:
  205. soup = BeautifulSoup(html, "html.parser")
  206. # 优先找"工具调用价格"标题节点
  207. target_node = None
  208. for node in soup.find_all(string=re.compile(r"工具调用价格")):
  209. if node.parent and node.parent.name in ("script", "style"):
  210. continue
  211. target_node = node
  212. break
  213. if not target_node:
  214. return ""
  215. # 向上找包含价格数字的容器
  216. ancestor = target_node.parent
  217. for _ in range(10):
  218. txt = ancestor.get_text(separator="\n")
  219. if ("元" in txt or "免费" in txt) and len(txt) > 50:
  220. return txt
  221. if ancestor.parent:
  222. ancestor = ancestor.parent
  223. else:
  224. break
  225. return ancestor.get_text(separator="\n")
  226. except Exception:
  227. return ""
  228. def scrape_tool_prices_standalone(
  229. url: str,
  230. headless: bool = True,
  231. timeout: int = 20000,
  232. executable_path: Optional[str] = None,
  233. ) -> Dict:
  234. """
  235. 独立运行:启动浏览器,导航,抓取工具调用价格后关闭。
  236. 返回:
  237. {
  238. "url": str,
  239. "error": str | None,
  240. "tool_call_prices": [
  241. {"label": "搜索策略", "price": 0.5, "unit": "元/千次调用", "currency": "CNY"},
  242. ...
  243. ]
  244. }
  245. """
  246. from playwright.sync_api import sync_playwright
  247. result: Dict = {"url": url, "error": None, "tool_call_prices": []}
  248. with sync_playwright() as p:
  249. launch_kwargs: Dict = {"headless": headless}
  250. if executable_path:
  251. launch_kwargs["executable_path"] = executable_path
  252. browser = p.chromium.launch(**launch_kwargs)
  253. page = browser.new_context().new_page()
  254. try:
  255. page.goto(url, wait_until="networkidle", timeout=timeout)
  256. except PlaywrightTimeoutError:
  257. try:
  258. page.goto(url, wait_until="load", timeout=timeout)
  259. except Exception as e:
  260. result["error"] = f"导航失败: {e}"
  261. browser.close()
  262. return result
  263. try:
  264. page.wait_for_selector("text=模型价格", timeout=8000)
  265. except PlaywrightTimeoutError:
  266. pass
  267. time.sleep(1.2)
  268. html = page.content()
  269. price_text = _get_tool_price_section_text(html)
  270. if not price_text:
  271. # 尝试滚动后重试
  272. try:
  273. page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
  274. time.sleep(1.5)
  275. html = page.content()
  276. price_text = _get_tool_price_section_text(html)
  277. except Exception:
  278. pass
  279. if not price_text:
  280. result["error"] = "未找到工具调用价格区域"
  281. browser.close()
  282. return result
  283. print(f"[DEBUG] 工具调用价格区域文本:\n{price_text[:300]}")
  284. result["tool_call_prices"] = parse_tool_prices_from_text(price_text)
  285. browser.close()
  286. return result
  287. if __name__ == "__main__":
  288. import argparse, os
  289. ap = argparse.ArgumentParser(description="抓取阿里云模型工具调用价格")
  290. group = ap.add_mutually_exclusive_group(required=True)
  291. group.add_argument("--url")
  292. group.add_argument("--file")
  293. ap.add_argument("--headful", action="store_true")
  294. ap.add_argument("--timeout", type=int, default=20000)
  295. ap.add_argument("--browser-path")
  296. args = ap.parse_args()
  297. urls = [args.url] if args.url else open(args.file, encoding="utf-8").read().splitlines()
  298. urls = [u.strip() for u in urls if u.strip()]
  299. exec_path = args.browser_path or os.environ.get("PLAYWRIGHT_EXECUTABLE")
  300. headless = not args.headful
  301. results = []
  302. for u in urls:
  303. print(f"抓取工具调用价格: {u}", flush=True)
  304. results.append(scrape_tool_prices_standalone(u, headless=headless, timeout=args.timeout, executable_path=exec_path))
  305. print(json.dumps(results, ensure_ascii=False, indent=2))