|
@@ -51,7 +51,7 @@ from scrape_model_icon import _extract_icon_from_page
|
|
|
def _navigate(page, url: str, timeout: int) -> bool:
|
|
def _navigate(page, url: str, timeout: int) -> bool:
|
|
|
"""导航到 URL,返回是否成功。"""
|
|
"""导航到 URL,返回是否成功。"""
|
|
|
try:
|
|
try:
|
|
|
- page.goto(url, wait_until="networkidle", timeout=timeout)
|
|
|
|
|
|
|
+ page.goto(url, wait_until="domcontentloaded", timeout=timeout)
|
|
|
return True
|
|
return True
|
|
|
except PlaywrightTimeoutError:
|
|
except PlaywrightTimeoutError:
|
|
|
try:
|
|
try:
|
|
@@ -64,7 +64,7 @@ def _navigate(page, url: str, timeout: int) -> bool:
|
|
|
|
|
|
|
|
def _wait_for_content(page) -> None:
|
|
def _wait_for_content(page) -> None:
|
|
|
"""等待页面核心内容渲染完成。"""
|
|
"""等待页面核心内容渲染完成。"""
|
|
|
- for sel in ["text=模型价格", "text=模型介绍", "text=模型能力"]:
|
|
|
|
|
|
|
+ for sel in ["text=模型价格", "text=模型介绍", "text=模型能力", "text=模型限流"]:
|
|
|
try:
|
|
try:
|
|
|
page.wait_for_selector(sel, timeout=6000)
|
|
page.wait_for_selector(sel, timeout=6000)
|
|
|
break
|
|
break
|
|
@@ -81,6 +81,27 @@ def _wait_for_content(page) -> None:
|
|
|
pass
|
|
pass
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
+def _parse_cookies_env(cookies_str: str, domain: str = ".aliyun.com") -> List[Dict]:
|
|
|
|
|
+ """
|
|
|
|
|
+ 将 Cookie 字符串(浏览器复制的 name=value; name2=value2 格式)
|
|
|
|
|
+ 解析为 Playwright set_cookies 所需的列表格式。
|
|
|
|
|
+ """
|
|
|
|
|
+ cookies = []
|
|
|
|
|
+ for part in cookies_str.split(";"):
|
|
|
|
|
+ part = part.strip()
|
|
|
|
|
+ if not part:
|
|
|
|
|
+ continue
|
|
|
|
|
+ if "=" in part:
|
|
|
|
|
+ name, _, value = part.partition("=")
|
|
|
|
|
+ cookies.append({
|
|
|
|
|
+ "name": name.strip(),
|
|
|
|
|
+ "value": value.strip(),
|
|
|
|
|
+ "domain": domain,
|
|
|
|
|
+ "path": "/",
|
|
|
|
|
+ })
|
|
|
|
|
+ return cookies
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
def scrape_all(
|
|
def scrape_all(
|
|
|
url: str,
|
|
url: str,
|
|
|
headless: bool = True,
|
|
headless: bool = True,
|
|
@@ -109,6 +130,9 @@ def scrape_all(
|
|
|
# 其余模块共享一个浏览器实例
|
|
# 其余模块共享一个浏览器实例
|
|
|
shared_modules = [m for m in modules if m != "price"]
|
|
shared_modules = [m for m in modules if m != "price"]
|
|
|
|
|
|
|
|
|
|
+ # 从环境变量读取登录 Cookie
|
|
|
|
|
+ aliyun_cookies_str = os.environ.get("ALIYUN_COOKIES", "").strip()
|
|
|
|
|
+
|
|
|
# ── 共享浏览器:info / rate / tool ──────────────────────────────────────────
|
|
# ── 共享浏览器:info / rate / tool ──────────────────────────────────────────
|
|
|
if shared_modules:
|
|
if shared_modules:
|
|
|
api_data: List[Dict] = []
|
|
api_data: List[Dict] = []
|
|
@@ -130,28 +154,51 @@ def scrape_all(
|
|
|
context_kwargs: Dict = {}
|
|
context_kwargs: Dict = {}
|
|
|
if api_key:
|
|
if api_key:
|
|
|
context_kwargs["extra_http_headers"] = {"Authorization": f"Bearer {api_key}"}
|
|
context_kwargs["extra_http_headers"] = {"Authorization": f"Bearer {api_key}"}
|
|
|
- page = browser.new_context(**context_kwargs).new_page()
|
|
|
|
|
|
|
+ context = browser.new_context(**context_kwargs)
|
|
|
|
|
|
|
|
- # 拦截 API 响应
|
|
|
|
|
- def on_response(resp):
|
|
|
|
|
|
|
+ # 注入登录 Cookie(避免被重定向到登录/免费试用页)
|
|
|
|
|
+ if aliyun_cookies_str:
|
|
|
|
|
+ cookies = _parse_cookies_env(aliyun_cookies_str)
|
|
|
|
|
+ if cookies:
|
|
|
|
|
+ context.add_cookies(cookies)
|
|
|
|
|
+ print(f"[INFO] 已注入 {len(cookies)} 个 Cookie")
|
|
|
|
|
+
|
|
|
|
|
+ page = context.new_page()
|
|
|
|
|
+
|
|
|
|
|
+ # 只拦截匹配 INFO_API_RE 的 JSON API 请求,其余直接放行
|
|
|
|
|
+ # 避免对图片/日志等请求调用 route.fetch() 导致 DNS 失败崩溃
|
|
|
|
|
+ import json as _json
|
|
|
|
|
+
|
|
|
|
|
+ def handle_api_route(route, request):
|
|
|
try:
|
|
try:
|
|
|
- if "application/json" not in resp.headers.get("content-type", ""):
|
|
|
|
|
- return
|
|
|
|
|
- if not INFO_API_RE.search(resp.url):
|
|
|
|
|
- return
|
|
|
|
|
|
|
+ resp = route.fetch()
|
|
|
try:
|
|
try:
|
|
|
- api_data.append(resp.json())
|
|
|
|
|
|
|
+ ct = resp.headers.get("content-type", "")
|
|
|
|
|
+ if "application/json" in ct:
|
|
|
|
|
+ api_data.append(_json.loads(resp.body()))
|
|
|
|
|
+ except Exception:
|
|
|
|
|
+ pass
|
|
|
|
|
+ route.fulfill(response=resp)
|
|
|
|
|
+ except Exception as e:
|
|
|
|
|
+ try:
|
|
|
|
|
+ route.continue_()
|
|
|
except Exception:
|
|
except Exception:
|
|
|
pass
|
|
pass
|
|
|
- except Exception:
|
|
|
|
|
- pass
|
|
|
|
|
|
|
|
|
|
- page.on("response", on_response)
|
|
|
|
|
|
|
+ # 只对匹配 API 的 URL 注册拦截,其余请求不拦截(直接走浏览器默认行为)
|
|
|
|
|
+ page.route(
|
|
|
|
|
+ lambda url: bool(INFO_API_RE.search(url)),
|
|
|
|
|
+ handle_api_route,
|
|
|
|
|
+ )
|
|
|
|
|
|
|
|
if not _navigate(page, url, timeout):
|
|
if not _navigate(page, url, timeout):
|
|
|
result["error"] = "导航失败"
|
|
result["error"] = "导航失败"
|
|
|
browser.close()
|
|
browser.close()
|
|
|
else:
|
|
else:
|
|
|
|
|
+ try:
|
|
|
|
|
+ page.wait_for_load_state("networkidle", timeout=20000)
|
|
|
|
|
+ except PlaywrightTimeoutError:
|
|
|
|
|
+ pass
|
|
|
_wait_for_content(page)
|
|
_wait_for_content(page)
|
|
|
|
|
|
|
|
# 从 API 找模型对象
|
|
# 从 API 找模型对象
|
|
@@ -160,7 +207,6 @@ def scrape_all(
|
|
|
found = _find_model_in_json(body, target)
|
|
found = _find_model_in_json(body, target)
|
|
|
if found:
|
|
if found:
|
|
|
model_obj = found
|
|
model_obj = found
|
|
|
- print(f"[INFO] API 找到模型: {found.get('model', found.get('name', target))}")
|
|
|
|
|
break
|
|
break
|
|
|
|
|
|
|
|
if not model_obj:
|
|
if not model_obj:
|
|
@@ -189,21 +235,101 @@ def scrape_all(
|
|
|
icon = _extract_icon_from_page(page)
|
|
icon = _extract_icon_from_page(page)
|
|
|
result["icon"] = icon.get("data") if icon.get("type") != "none" else None
|
|
result["icon"] = icon.get("data") if icon.get("type") != "none" else None
|
|
|
|
|
|
|
|
|
|
+ # ── price 模块(复用共享浏览器) ──
|
|
|
|
|
+ if "price" in modules:
|
|
|
|
|
+ try:
|
|
|
|
|
+ from scrape_aliyun_models import (
|
|
|
|
|
+ extract_price_items_from_html,
|
|
|
|
|
+ extract_price_block_html,
|
|
|
|
|
+ parse_prices_from_text,
|
|
|
|
|
+ _ensure_tiered_pricing,
|
|
|
|
|
+ _get_tier_options,
|
|
|
|
|
+ _select_tier_option,
|
|
|
|
|
+ _normalize_tier_option,
|
|
|
|
|
+ )
|
|
|
|
|
+ import time as _time
|
|
|
|
|
+
|
|
|
|
|
+ _ensure_tiered_pricing(page)
|
|
|
|
|
+ tier_options = _get_tier_options(page)
|
|
|
|
|
+ tiered_items = []
|
|
|
|
|
+ if tier_options:
|
|
|
|
|
+ for opt in tier_options:
|
|
|
|
|
+ if not _select_tier_option(page, opt):
|
|
|
|
|
+ continue
|
|
|
|
|
+ html = page.content()
|
|
|
|
|
+ try:
|
|
|
|
|
+ tier_items = extract_price_items_from_html(html)
|
|
|
|
|
+ except Exception:
|
|
|
|
|
+ tier_items = []
|
|
|
|
|
+ for it in tier_items:
|
|
|
|
|
+ it["tier"] = opt
|
|
|
|
|
+ tiered_items.extend(tier_items)
|
|
|
|
|
+
|
|
|
|
|
+ if tiered_items:
|
|
|
|
|
+ items = tiered_items
|
|
|
|
|
+ else:
|
|
|
|
|
+ html = page.content()
|
|
|
|
|
+ items = extract_price_items_from_html(html)
|
|
|
|
|
+ if not items:
|
|
|
|
|
+ text_block = extract_price_block_html(html)
|
|
|
|
|
+ items = parse_prices_from_text(text_block) if text_block else []
|
|
|
|
|
+
|
|
|
|
|
+ # 构建 price_map(复用 scrape_model_price 里的逻辑)
|
|
|
|
|
+ def _build_price_map(parsed_items):
|
|
|
|
|
+ price_map = {}
|
|
|
|
|
+ for it in parsed_items:
|
|
|
|
|
+ if isinstance(it, dict) and it.get("tiers") and isinstance(it.get("tiers"), dict):
|
|
|
|
|
+ for tier_key, tier_val in it["tiers"].items():
|
|
|
|
|
+ k = _normalize_tier_option(tier_key)
|
|
|
|
|
+ price_map.setdefault(k, {})
|
|
|
|
|
+ sub_label = tier_val.get("label") or tier_val.get("raw") or k
|
|
|
|
|
+ price_map[k][sub_label] = {kk: v for kk, v in tier_val.items() if kk not in ("tier", "tiers", "label")}
|
|
|
|
|
+ continue
|
|
|
|
|
+ if it.get("tier"):
|
|
|
|
|
+ tk = _normalize_tier_option(it.get("tier"))
|
|
|
|
|
+ price_map.setdefault(tk, {})
|
|
|
|
|
+ sub_label = it.get("label") or it.get("raw") or tk
|
|
|
|
|
+ price_map[tk][sub_label] = {kk: v for kk, v in it.items() if kk not in ("tier", "label")}
|
|
|
|
|
+ continue
|
|
|
|
|
+ lbl = it.get("label") or it.get("raw") or "price"
|
|
|
|
|
+ if lbl in price_map and not isinstance(price_map[lbl], list):
|
|
|
|
|
+ price_map[lbl] = [price_map[lbl]]
|
|
|
|
|
+ if isinstance(price_map.get(lbl), list):
|
|
|
|
|
+ price_map[lbl].append({kk: v for kk, v in it.items() if kk != "label"})
|
|
|
|
|
+ else:
|
|
|
|
|
+ price_map[lbl] = {kk: v for kk, v in it.items() if kk != "label"}
|
|
|
|
|
+ return price_map
|
|
|
|
|
+
|
|
|
|
|
+ result["prices"] = _build_price_map(items)
|
|
|
|
|
+ except Exception as e:
|
|
|
|
|
+ import traceback as _tb
|
|
|
|
|
+ print(f"[ERROR] 价格模块异常: {e}\n{_tb.format_exc()}")
|
|
|
|
|
+ result["prices"] = {}
|
|
|
|
|
+ result["price_error"] = str(e)
|
|
|
|
|
+
|
|
|
browser.close()
|
|
browser.close()
|
|
|
|
|
|
|
|
- # ── price 模块(原始脚本,独立浏览器) ──────────────────────────────────────
|
|
|
|
|
- if "price" in modules:
|
|
|
|
|
- print(f"[INFO] 运行价格模块...")
|
|
|
|
|
- price_result = scrape_model_price(
|
|
|
|
|
- url,
|
|
|
|
|
- headless=headless,
|
|
|
|
|
- timeout=timeout,
|
|
|
|
|
- executable_path=executable_path,
|
|
|
|
|
- api_key=api_key,
|
|
|
|
|
- )
|
|
|
|
|
- result["prices"] = price_result.get("prices", {})
|
|
|
|
|
- if price_result.get("error"):
|
|
|
|
|
- result["price_error"] = price_result["error"]
|
|
|
|
|
|
|
+ # ── price 模块回退:若 shared_modules 为空(不含 info/rate/tool),独立启动浏览器 ──
|
|
|
|
|
+ if "price" in modules and not shared_modules:
|
|
|
|
|
+ print(f"[INFO] 运行价格模块(独立浏览器)...")
|
|
|
|
|
+ try:
|
|
|
|
|
+ price_result = scrape_model_price(
|
|
|
|
|
+ url,
|
|
|
|
|
+ headless=headless,
|
|
|
|
|
+ timeout=timeout,
|
|
|
|
|
+ executable_path=executable_path,
|
|
|
|
|
+ api_key=api_key,
|
|
|
|
|
+ cookies_str=aliyun_cookies_str,
|
|
|
|
|
+ )
|
|
|
|
|
+ result["prices"] = price_result.get("prices", {})
|
|
|
|
|
+ if price_result.get("error"):
|
|
|
|
|
+ result["price_error"] = price_result["error"]
|
|
|
|
|
+ print(f"[WARN] 价格模块错误: {price_result['error']}")
|
|
|
|
|
+ except Exception as e:
|
|
|
|
|
+ import traceback as _tb
|
|
|
|
|
+ print(f"[ERROR] 价格模块异常: {e}\n{_tb.format_exc()}")
|
|
|
|
|
+ result["prices"] = {}
|
|
|
|
|
+ result["price_error"] = str(e)
|
|
|
|
|
|
|
|
return result
|
|
return result
|
|
|
|
|
|