Просмотр исходного кода

更改爬虫逻辑,阿里页面发生变化,需更改爬虫逻辑

lxylxy123321 1 месяц назад
Родитель
Коммит
4f2fb38a9e

+ 2 - 1
backend/.env

@@ -9,10 +9,11 @@ DB_NAME=crawl_test
 
 ALLOWED_ORIGINS=http://localhost:5173
 # ALLOWED_ORIGINS=https://crawler.aitoolcore.com
+# ALLOWED_ORIGINS=http://8.137.98.23:5173
 GEOIP_DB_PATH=./GeoLite2-City.mmdb
 #本地
 PLAYWRIGHT_EXECUTABLE=D:\playwright-browsers\chromium-1208\chrome-win64\chrome.exe
-# #生产
+#生产
 # PLAYWRIGHT_EXECUTABLE=/www/wwwroot/playwright/chromium-1045/chrome-linux/chrome
 PLAYWRIGHT_HEADLESS=true
 

+ 7 - 0
backend/app/services/scraper.py

@@ -21,6 +21,13 @@ for _p in (_backend_root, _crawl_dir):
 
 from main import scrape_all  # noqa: E402  (backend/crawl/main.py)
 
+# 启动时打印当前 INFO_API_RE,用于确认加载的是最新代码
+import main as _crawl_main
+import logging as _logging
+_logging.getLogger(__name__).info(
+    f"[scraper] INFO_API_RE = {_crawl_main.INFO_API_RE.pattern[:60]}..."
+)
+
 
 class ScraperService:
     """Manages the lifecycle of a scrape job."""

+ 153 - 27
backend/crawl/main.py

@@ -51,7 +51,7 @@ from scrape_model_icon import _extract_icon_from_page
 def _navigate(page, url: str, timeout: int) -> bool:
     """导航到 URL,返回是否成功。"""
     try:
-        page.goto(url, wait_until="networkidle", timeout=timeout)
+        page.goto(url, wait_until="domcontentloaded", timeout=timeout)
         return True
     except PlaywrightTimeoutError:
         try:
@@ -64,7 +64,7 @@ def _navigate(page, url: str, timeout: int) -> bool:
 
 def _wait_for_content(page) -> None:
     """等待页面核心内容渲染完成。"""
-    for sel in ["text=模型价格", "text=模型介绍", "text=模型能力"]:
+    for sel in ["text=模型价格", "text=模型介绍", "text=模型能力", "text=模型限流"]:
         try:
             page.wait_for_selector(sel, timeout=6000)
             break
@@ -81,6 +81,27 @@ def _wait_for_content(page) -> None:
         pass
 
 
+def _parse_cookies_env(cookies_str: str, domain: str = ".aliyun.com") -> List[Dict]:
+    """
+    将 Cookie 字符串(浏览器复制的 name=value; name2=value2 格式)
+    解析为 Playwright BrowserContext.add_cookies 所需的列表格式。
+    """
+    cookies = []
+    for part in cookies_str.split(";"):
+        part = part.strip()
+        if not part:
+            continue
+        if "=" in part:
+            name, _, value = part.partition("=")
+            cookies.append({
+                "name": name.strip(),
+                "value": value.strip(),
+                "domain": domain,
+                "path": "/",
+            })
+    return cookies
+
+
 def scrape_all(
     url: str,
     headless: bool = True,
@@ -109,6 +130,9 @@ def scrape_all(
     # 其余模块共享一个浏览器实例
     shared_modules = [m for m in modules if m != "price"]
 
+    # 从环境变量读取登录 Cookie
+    aliyun_cookies_str = os.environ.get("ALIYUN_COOKIES", "").strip()
+
     # ── 共享浏览器:info / rate / tool ──────────────────────────────────────────
     if shared_modules:
         api_data: List[Dict] = []
@@ -130,28 +154,51 @@ def scrape_all(
             context_kwargs: Dict = {}
             if api_key:
                 context_kwargs["extra_http_headers"] = {"Authorization": f"Bearer {api_key}"}
-            page = browser.new_context(**context_kwargs).new_page()
+            context = browser.new_context(**context_kwargs)
 
-            # 拦截 API 响应
-            def on_response(resp):
+            # 注入登录 Cookie(避免被重定向到登录/免费试用页)
+            if aliyun_cookies_str:
+                cookies = _parse_cookies_env(aliyun_cookies_str)
+                if cookies:
+                    context.add_cookies(cookies)
+                    print(f"[INFO] 已注入 {len(cookies)} 个 Cookie")
+
+            page = context.new_page()
+
+            # 只拦截匹配 INFO_API_RE 的 JSON API 请求,其余直接放行
+            # 避免对图片/日志等请求调用 route.fetch() 导致 DNS 失败崩溃
+            import json as _json
+
+            def handle_api_route(route, request):
                 try:
-                    if "application/json" not in resp.headers.get("content-type", ""):
-                        return
-                    if not INFO_API_RE.search(resp.url):
-                        return
+                    resp = route.fetch()
                     try:
-                        api_data.append(resp.json())
+                        ct = resp.headers.get("content-type", "")
+                        if "application/json" in ct:
+                            api_data.append(_json.loads(resp.body()))
+                    except Exception:
+                        pass
+                    route.fulfill(response=resp)
+                except Exception as e:
+                    try:
+                        route.continue_()
                     except Exception:
                         pass
-                except Exception:
-                    pass
 
-            page.on("response", on_response)
+            # 只对匹配 API 的 URL 注册拦截,其余请求不拦截(直接走浏览器默认行为)
+            page.route(
+                lambda url: bool(INFO_API_RE.search(url)),
+                handle_api_route,
+            )
 
             if not _navigate(page, url, timeout):
                 result["error"] = "导航失败"
                 browser.close()
             else:
+                try:
+                    page.wait_for_load_state("networkidle", timeout=20000)
+                except PlaywrightTimeoutError:
+                    pass
                 _wait_for_content(page)
 
                 # 从 API 找模型对象
@@ -160,7 +207,6 @@ def scrape_all(
                     found = _find_model_in_json(body, target)
                     if found:
                         model_obj = found
-                        print(f"[INFO] API 找到模型: {found.get('model', found.get('name', target))}")
                         break
 
                 if not model_obj:
@@ -189,21 +235,101 @@ def scrape_all(
                     icon = _extract_icon_from_page(page)
                     result["icon"] = icon.get("data") if icon.get("type") != "none" else None
 
+                # ── price 模块(复用共享浏览器) ──
+                if "price" in modules:
+                    try:
+                        from scrape_aliyun_models import (
+                            extract_price_items_from_html,
+                            extract_price_block_html,
+                            parse_prices_from_text,
+                            _ensure_tiered_pricing,
+                            _get_tier_options,
+                            _select_tier_option,
+                            _normalize_tier_option,
+                        )
+                        import time as _time
+
+                        _ensure_tiered_pricing(page)
+                        tier_options = _get_tier_options(page)
+                        tiered_items = []
+                        if tier_options:
+                            for opt in tier_options:
+                                if not _select_tier_option(page, opt):
+                                    continue
+                                html = page.content()
+                                try:
+                                    tier_items = extract_price_items_from_html(html)
+                                except Exception:
+                                    tier_items = []
+                                for it in tier_items:
+                                    it["tier"] = opt
+                                tiered_items.extend(tier_items)
+
+                        if tiered_items:
+                            items = tiered_items
+                        else:
+                            html = page.content()
+                            items = extract_price_items_from_html(html)
+                            if not items:
+                                text_block = extract_price_block_html(html)
+                                items = parse_prices_from_text(text_block) if text_block else []
+
+                        # 构建 price_map(复用 scrape_model_price 里的逻辑)
+                        def _build_price_map(parsed_items):
+                            price_map = {}
+                            for it in parsed_items:
+                                if isinstance(it, dict) and it.get("tiers") and isinstance(it.get("tiers"), dict):
+                                    for tier_key, tier_val in it["tiers"].items():
+                                        k = _normalize_tier_option(tier_key)
+                                        price_map.setdefault(k, {})
+                                        sub_label = tier_val.get("label") or tier_val.get("raw") or k
+                                        price_map[k][sub_label] = {kk: v for kk, v in tier_val.items() if kk not in ("tier", "tiers", "label")}
+                                    continue
+                                if it.get("tier"):
+                                    tk = _normalize_tier_option(it.get("tier"))
+                                    price_map.setdefault(tk, {})
+                                    sub_label = it.get("label") or it.get("raw") or tk
+                                    price_map[tk][sub_label] = {kk: v for kk, v in it.items() if kk not in ("tier", "label")}
+                                    continue
+                                lbl = it.get("label") or it.get("raw") or "price"
+                                if lbl in price_map and not isinstance(price_map[lbl], list):
+                                    price_map[lbl] = [price_map[lbl]]
+                                if isinstance(price_map.get(lbl), list):
+                                    price_map[lbl].append({kk: v for kk, v in it.items() if kk != "label"})
+                                else:
+                                    price_map[lbl] = {kk: v for kk, v in it.items() if kk != "label"}
+                            return price_map
+
+                        result["prices"] = _build_price_map(items)
+                    except Exception as e:
+                        import traceback as _tb
+                        print(f"[ERROR] 价格模块异常: {e}\n{_tb.format_exc()}")
+                        result["prices"] = {}
+                        result["price_error"] = str(e)
+
                 browser.close()
 
-    # ── price 模块(原始脚本,独立浏览器) ──────────────────────────────────────
-    if "price" in modules:
-        print(f"[INFO] 运行价格模块...")
-        price_result = scrape_model_price(
-            url,
-            headless=headless,
-            timeout=timeout,
-            executable_path=executable_path,
-            api_key=api_key,
-        )
-        result["prices"] = price_result.get("prices", {})
-        if price_result.get("error"):
-            result["price_error"] = price_result["error"]
+    # ── price 模块回退:若 shared_modules 为空(不含 info/rate/tool),独立启动浏览器 ──
+    if "price" in modules and not shared_modules:
+        print(f"[INFO] 运行价格模块(独立浏览器)...")
+        try:
+            price_result = scrape_model_price(
+                url,
+                headless=headless,
+                timeout=timeout,
+                executable_path=executable_path,
+                api_key=api_key,
+                cookies_str=aliyun_cookies_str,
+            )
+            result["prices"] = price_result.get("prices", {})
+            if price_result.get("error"):
+                result["price_error"] = price_result["error"]
+                print(f"[WARN] 价格模块错误: {price_result['error']}")
+        except Exception as e:
+            import traceback as _tb
+            print(f"[ERROR] 价格模块异常: {e}\n{_tb.format_exc()}")
+            result["prices"] = {}
+            result["price_error"] = str(e)
 
     return result
 

+ 27 - 2
backend/crawl/scrape_aliyun_models.py

@@ -628,7 +628,7 @@ def extract_price_items_global(html: str) -> List[Dict]:
     return parse_prices_from_text(ancestor.get_text(separator="\n"))
 
 
-def scrape_model_price(url: str, headless: bool = True, timeout: int = 20000, executable_path: Optional[str] = None, api_key: Optional[str] = None) -> Dict:
+def scrape_model_price(url: str, headless: bool = True, timeout: int = 20000, executable_path: Optional[str] = None, api_key: Optional[str] = None, cookies_str: Optional[str] = None) -> Dict:
     result = {"url": url, "error": None, "items": []}
 
     with sync_playwright() as p:
@@ -648,6 +648,26 @@ def scrape_model_price(url: str, headless: bool = True, timeout: int = 20000, ex
         if api_key:
             context_kwargs["extra_http_headers"] = {"Authorization": f"Bearer {api_key}"}
         context = browser.new_context(**context_kwargs)
+
+        # 注入登录 Cookie(避免被重定向到登录/免费试用页)
+        _cookies_str = cookies_str or os.environ.get("ALIYUN_COOKIES", "").strip()
+        if _cookies_str:
+            cookies = []
+            for part in _cookies_str.split(";"):
+                part = part.strip()
+                if not part or "=" not in part:
+                    continue
+                name, _, value = part.partition("=")
+                cookies.append({
+                    "name": name.strip(),
+                    "value": value.strip(),
+                    "domain": ".aliyun.com",
+                    "path": "/",
+                })
+            if cookies:
+                context.add_cookies(cookies)
+                print(f"[INFO][price] 已注入 {len(cookies)} 个 Cookie")
+
         page = context.new_page()
 
         network_hits = []
@@ -680,7 +700,7 @@ def scrape_model_price(url: str, headless: bool = True, timeout: int = 20000, ex
         page.on("console", _on_console)
         page.on("response", _on_response)
         try:
-            page.goto(url, wait_until="networkidle", timeout=timeout)
+            page.goto(url, wait_until="domcontentloaded", timeout=timeout)
         except PlaywrightTimeoutError:
             try:
                 page.goto(url, wait_until="load", timeout=timeout)
@@ -689,6 +709,11 @@ def scrape_model_price(url: str, headless: bool = True, timeout: int = 20000, ex
                 browser.close()
                 return result
 
+        try:
+            page.wait_for_load_state("networkidle", timeout=20000)
+        except PlaywrightTimeoutError:
+            pass
+
         try:
             page.wait_for_selector("text=模型价格", timeout=8000)
         except PlaywrightTimeoutError:

+ 34 - 20
backend/crawl/scrape_model_info.py

@@ -68,7 +68,9 @@ ALL_FEATURES_ORDERED: List[str] = [
 
 # 需要拦截的 API URL 关键词
 API_URL_RE = re.compile(
-    r"listFoundationModels|listRecommendedModels|listFeaturedModels|getModelDetail|modelCenter",
+    r"listFoundationModels|listRecommendedModels|listFeaturedModels|getModelDetail|modelCenter"
+    r"|BroadScopeAspnGateway"
+    r"|bailian-cs\.console\.aliyun\.com/data/api",
     re.I,
 )
 
@@ -115,29 +117,41 @@ def _merge_with_items(obj: Dict) -> Dict:
 def _find_model_in_json(data: Any, target: str) -> Optional[Dict]:
     """
     递归在 JSON 数据中查找与 target 匹配的模型对象。
-    匹配规则:model 或 name 字段去掉 group- 前缀后与 target 完全相等(优先),
-    或 target 是 model_val 的完整前缀(如 qwen3-max 匹配 qwen3-max-0919)。
+    匹配规则:
+      1. model 字段去掉 group- 前缀后与 target 完全相等(精确匹配,优先返回)
+      2. 否则返回遍历中最先遇到的前缀匹配(以 "-" 分隔,以下两种方向不分先后):
+         model_val 以 target 开头(target glm-5.1 匹配 glm-5.1-0419),或 target 以 model_val 开头(target qwen3-max-0919 匹配 qwen3-max)
     找到后自动用 items[0] 补充缺失字段。
     """
     clean_target = re.sub(r"^group-", "", target.lower())
 
-    if isinstance(data, dict):
-        model_val = re.sub(r"^group-", "", str(data.get("model", "")).lower())
-        name_val = str(data.get("name", "")).lower()
-        # 精确匹配 model 字段
-        is_match = (model_val == clean_target)
-        if is_match and ("model" in data or "name" in data):
-            return _merge_with_items(data)
-        for v in data.values():
-            found = _find_model_in_json(v, target)
-            if found:
-                return found
-    elif isinstance(data, list):
-        for item in data:
-            found = _find_model_in_json(item, target)
-            if found:
-                return found
-    return None
+    # 单次遍历:同时记录首个精确匹配和首个前缀匹配,返回时精确匹配优先
+    exact_match: Optional[Dict] = None
+    prefix_match: Optional[Dict] = None
+
+    def _scan(node: Any) -> None:
+        nonlocal exact_match, prefix_match
+        if isinstance(node, dict):
+            model_val = re.sub(r"^group-", "", str(node.get("model", "")).lower())
+            if model_val and ("model" in node or "name" in node):
+                if model_val == clean_target:
+                    if exact_match is None:
+                        exact_match = _merge_with_items(node)
+                    return
+                # 前缀匹配:target 是 model_val 的前缀(glm-5.1 → glm-5.1-0419)
+                # 或 model_val 是 target 的前缀(qwen3-max → qwen3-max-0919)
+                if (model_val.startswith(clean_target + "-") or
+                        clean_target.startswith(model_val + "-")):
+                    if prefix_match is None:
+                        prefix_match = _merge_with_items(node)
+            for v in node.values():
+                _scan(v)
+        elif isinstance(node, list):
+            for item in node:
+                _scan(item)
+
+    _scan(data)
+    return exact_match or prefix_match
 
 
 def parse_model_info(model_obj: Dict) -> Dict:

+ 7 - 1
backend/main.py

@@ -9,4 +9,10 @@ logging.basicConfig(
 )
 
 if __name__ == "__main__":
-    uvicorn.run("app.main:app", host=settings.host, port=settings.port, reload=True)
+    uvicorn.run(
+        "app.main:app",
+        host=settings.host,
+        port=settings.port,
+        reload=True,
+        reload_dirs=["app", "crawl"],  # 同时监听 crawl 目录
+    )

+ 2 - 1
frontend/.env

@@ -1,4 +1,5 @@
 #测试
 VITE_API_BASE_URL=http://localhost:8000
 #生产
-# VITE_API_BASE_URL=https://crawler-api.aitoolcore.com
+# VITE_API_BASE_URL=https://crawler-api.aitoolcore.com
+# VITE_API_BASE_URL=http://8.137.98.23:8000

+ 10 - 0
frontend/src/pages/Scraper.css

@@ -515,6 +515,16 @@
   margin-bottom: 6px;
 }
 
+.info-error {
+  font-size: 11px;
+  color: var(--neon-red);
+  background: rgba(255, 68, 102, 0.08);
+  border: 1px solid rgba(255, 68, 102, 0.3);
+  border-radius: 3px;
+  padding: 4px 8px;
+  margin-bottom: 6px;
+}
+
 .tag-row {
   display: flex;
   flex-wrap: wrap;

+ 4 - 1
frontend/src/pages/Scraper.tsx

@@ -17,6 +17,9 @@ function PriceCard({ result }: { result: NonNullable<ScrapeJobDetail['results']>
       {model_info && (
         <div className="info-section">
           <div className="info-section-title">模型信息</div>
+          {model_info.error && (
+            <div className="info-error">⚠ {model_info.error}</div>
+          )}
           {model_info.display_tags && model_info.display_tags.length > 0 && (
             <div className="tag-row">
               {model_info.display_tags.map(t => <span key={t} className="tag">{t}</span>)}
@@ -246,7 +249,7 @@ export function Scraper() {
                 <input type="checkbox" checked={selected.has(m.id)} onChange={() => toggleSelect(m.id)} />
                 <span className="model-name" title={m.url}>{m.name}</span>
               </label>
-              {m.api_key && <span className="model-key-badge" title="已配置 API Key">🔑</span>}
+              {m.api_key_id && <span className="model-key-badge" title="已配置 API Key">🔑</span>}
             </li>
           ))}
         </ul>