2 месяцев назад · 4f2fb38a9e
--- a/backend/.env
+++ b/backend/.env
@@ -9,10 +9,11 @@ DB_NAME=crawl_test
 
				 
			
 
				 ALLOWED_ORIGINS=http://localhost:5173
			
 
				 # ALLOWED_ORIGINS=https://crawler.aitoolcore.com
			
 
				+# ALLOWED_ORIGINS=http://8.137.98.23:5173
			
 
				 GEOIP_DB_PATH=./GeoLite2-City.mmdb
			
 
				 #本地
			
 
				 PLAYWRIGHT_EXECUTABLE=D:\playwright-browsers\chromium-1208\chrome-win64\chrome.exe
			
 
				-# #生产
			
 
				+#生产
			
 
				 # PLAYWRIGHT_EXECUTABLE=/www/wwwroot/playwright/chromium-1045/chrome-linux/chrome
			
 
				 PLAYWRIGHT_HEADLESS=true
			
 
				 
			
--- a/backend/app/services/scraper.py
+++ b/backend/app/services/scraper.py
@@ -21,6 +21,13 @@ for _p in (_backend_root, _crawl_dir):
 
				 
			
 
				 from main import scrape_all  # noqa: E402  (backend/crawl/main.py)
			
 
				 
			
 
				# Log the active INFO_API_RE at import time so it is easy to confirm that
				# the latest crawl code was loaded.
				import main as _crawl_main
				import logging as _logging

				# Lazy %-style arguments: the message is only formatted if the record is
				# actually emitted. Only the first 60 characters of the pattern are shown.
				_logging.getLogger(__name__).info(
				    "[scraper] INFO_API_RE = %s...",
				    _crawl_main.INFO_API_RE.pattern[:60],
				)
			
 
				+
			
 
				 
			
 
				 class ScraperService:
			
 
				     """Manages the lifecycle of a scrape job."""
			
--- a/backend/crawl/main.py
+++ b/backend/crawl/main.py
@@ -51,7 +51,7 @@ from scrape_model_icon import _extract_icon_from_page
 
				 def _navigate(page, url: str, timeout: int) -> bool:
			
 
				     """导航到 URL，返回是否成功。"""
			
 
				     try:
			
 
				-        page.goto(url, wait_until="networkidle", timeout=timeout)
			
 
				+        page.goto(url, wait_until="domcontentloaded", timeout=timeout)
			
 
				         return True
			
 
				     except PlaywrightTimeoutError:
			
 
				         try:
			
@@ -64,7 +64,7 @@ def _navigate(page, url: str, timeout: int) -> bool:
 
				 
			
 
				 def _wait_for_content(page) -> None:
			
 
				     """等待页面核心内容渲染完成。"""
			
 
				-    for sel in ["text=模型价格", "text=模型介绍", "text=模型能力"]:
			
 
				+    for sel in ["text=模型价格", "text=模型介绍", "text=模型能力", "text=模型限流"]:
			
 
				         try:
			
 
				             page.wait_for_selector(sel, timeout=6000)
			
 
				             break
			
@@ -81,6 +81,27 @@ def _wait_for_content(page) -> None:
 
				         pass
			
 
				 
			
 
				 
			
 
				+def _parse_cookies_env(cookies_str: str, domain: str = ".aliyun.com") -> List[Dict]:
			
 
				+    """
			
 
				+    将 Cookie 字符串（浏览器复制的 name=value; name2=value2 格式）
			
 
				+    解析为 Playwright set_cookies 所需的列表格式。
			
 
				+    """
			
 
				+    cookies = []
			
 
				+    for part in cookies_str.split(";"):
			
 
				+        part = part.strip()
			
 
				+        if not part:
			
 
				+            continue
			
 
				+        if "=" in part:
			
 
				+            name, _, value = part.partition("=")
			
 
				+            cookies.append({
			
 
				+                "name": name.strip(),
			
 
				+                "value": value.strip(),
			
 
				+                "domain": domain,
			
 
				+                "path": "/",
			
 
				+            })
			
 
				+    return cookies
			
 
				+
			
 
				+
			
 
				 def scrape_all(
			
 
				     url: str,
			
 
				     headless: bool = True,
			
@@ -109,6 +130,9 @@ def scrape_all(
 
				     # 其余模块共享一个浏览器实例
			
 
				     shared_modules = [m for m in modules if m != "price"]
			
 
				 
			
 
				+    # 从环境变量读取登录 Cookie
			
 
				+    aliyun_cookies_str = os.environ.get("ALIYUN_COOKIES", "").strip()
			
 
				+
			
 
				     # ── 共享浏览器：info / rate / tool ──────────────────────────────────────────
			
 
				     if shared_modules:
			
 
				         api_data: List[Dict] = []
			
@@ -130,28 +154,51 @@ def scrape_all(
 
				             context_kwargs: Dict = {}
			
 
				             if api_key:
			
 
				                 context_kwargs["extra_http_headers"] = {"Authorization": f"Bearer {api_key}"}
			
 
				-            page = browser.new_context(**context_kwargs).new_page()
			
 
				+            context = browser.new_context(**context_kwargs)
			
 
				 
			
 
				-            # 拦截 API 响应
			
 
				-            def on_response(resp):
			
 
				+            # 注入登录 Cookie（避免被重定向到登录/免费试用页）
			
 
				+            if aliyun_cookies_str:
			
 
				+                cookies = _parse_cookies_env(aliyun_cookies_str)
			
 
				+                if cookies:
			
 
				+                    context.add_cookies(cookies)
			
 
				+                    print(f"[INFO] 已注入 {len(cookies)} 个 Cookie")
			
 
				+
			
 
				+            page = context.new_page()
			
 
				+
			
 
				+            # 只拦截匹配 INFO_API_RE 的 JSON API 请求，其余直接放行
			
 
				+            # 避免对图片/日志等请求调用 route.fetch() 导致 DNS 失败崩溃
			
 
				+            import json as _json
			
 
				+
			
 
				+            def handle_api_route(route, request):
			
 
				                 try:
			
 
				-                    if "application/json" not in resp.headers.get("content-type", ""):
			
 
				-                        return
			
 
				-                    if not INFO_API_RE.search(resp.url):
			
 
				-                        return
			
 
				+                    resp = route.fetch()
			
 
				                     try:
			
 
				-                        api_data.append(resp.json())
			
 
				+                        ct = resp.headers.get("content-type", "")
			
 
				+                        if "application/json" in ct:
			
 
				+                            api_data.append(_json.loads(resp.body()))
			
 
				+                    except Exception:
			
 
				+                        pass
			
 
				+                    route.fulfill(response=resp)
			
 
				+                except Exception as e:
			
 
				+                    try:
			
 
				+                        route.continue_()
			
 
				                     except Exception:
			
 
				                         pass
			
 
				-                except Exception:
			
 
				-                    pass
			
 
				 
			
 
				-            page.on("response", on_response)
			
 
				+            # 只对匹配 API 的 URL 注册拦截，其余请求不拦截（直接走浏览器默认行为）
			
 
				+            page.route(
			
 
				+                lambda url: bool(INFO_API_RE.search(url)),
			
 
				+                handle_api_route,
			
 
				+            )
			
 
				 
			
 
				             if not _navigate(page, url, timeout):
			
 
				                 result["error"] = "导航失败"
			
 
				                 browser.close()
			
 
				             else:
			
 
				+                try:
			
 
				+                    page.wait_for_load_state("networkidle", timeout=20000)
			
 
				+                except PlaywrightTimeoutError:
			
 
				+                    pass
			
 
				                 _wait_for_content(page)
			
 
				 
			
 
				                 # 从 API 找模型对象
			
@@ -160,7 +207,6 @@ def scrape_all(
 
				                     found = _find_model_in_json(body, target)
			
 
				                     if found:
			
 
				                         model_obj = found
			
 
				-                        print(f"[INFO] API 找到模型: {found.get('model', found.get('name', target))}")
			
 
				                         break
			
 
				 
			
 
				                 if not model_obj:
			
@@ -189,21 +235,101 @@ def scrape_all(
 
				                     icon = _extract_icon_from_page(page)
			
 
				                     result["icon"] = icon.get("data") if icon.get("type") != "none" else None
			
 
				 
			
 
				+                # ── price 模块（复用共享浏览器） ──
			
 
				+                if "price" in modules:
			
 
				+                    try:
			
 
				+                        from scrape_aliyun_models import (
			
 
				+                            extract_price_items_from_html,
			
 
				+                            extract_price_block_html,
			
 
				+                            parse_prices_from_text,
			
 
				+                            _ensure_tiered_pricing,
			
 
				+                            _get_tier_options,
			
 
				+                            _select_tier_option,
			
 
				+                            _normalize_tier_option,
			
 
				+                        )
			
 
				+                        import time as _time
			
 
				+
			
 
				+                        _ensure_tiered_pricing(page)
			
 
				+                        tier_options = _get_tier_options(page)
			
 
				+                        tiered_items = []
			
 
				+                        if tier_options:
			
 
				+                            for opt in tier_options:
			
 
				+                                if not _select_tier_option(page, opt):
			
 
				+                                    continue
			
 
				+                                html = page.content()
			
 
				+                                try:
			
 
				+                                    tier_items = extract_price_items_from_html(html)
			
 
				+                                except Exception:
			
 
				+                                    tier_items = []
			
 
				+                                for it in tier_items:
			
 
				+                                    it["tier"] = opt
			
 
				+                                tiered_items.extend(tier_items)
			
 
				+
			
 
				+                        if tiered_items:
			
 
				+                            items = tiered_items
			
 
				+                        else:
			
 
				+                            html = page.content()
			
 
				+                            items = extract_price_items_from_html(html)
			
 
				+                            if not items:
			
 
				+                                text_block = extract_price_block_html(html)
			
 
				+                                items = parse_prices_from_text(text_block) if text_block else []
			
 
				+
			
 
				+                        # 构建 price_map（复用 scrape_model_price 里的逻辑）
			
 
				+                        def _build_price_map(parsed_items):
			
 
				+                            price_map = {}
			
 
				+                            for it in parsed_items:
			
 
				+                                if isinstance(it, dict) and it.get("tiers") and isinstance(it.get("tiers"), dict):
			
 
				+                                    for tier_key, tier_val in it["tiers"].items():
			
 
				+                                        k = _normalize_tier_option(tier_key)
			
 
				+                                        price_map.setdefault(k, {})
			
 
				+                                        sub_label = tier_val.get("label") or tier_val.get("raw") or k
			
 
				+                                        price_map[k][sub_label] = {kk: v for kk, v in tier_val.items() if kk not in ("tier", "tiers", "label")}
			
 
				+                                    continue
			
 
				+                                if it.get("tier"):
			
 
				+                                    tk = _normalize_tier_option(it.get("tier"))
			
 
				+                                    price_map.setdefault(tk, {})
			
 
				+                                    sub_label = it.get("label") or it.get("raw") or tk
			
 
				+                                    price_map[tk][sub_label] = {kk: v for kk, v in it.items() if kk not in ("tier", "label")}
			
 
				+                                    continue
			
 
				+                                lbl = it.get("label") or it.get("raw") or "price"
			
 
				+                                if lbl in price_map and not isinstance(price_map[lbl], list):
			
 
				+                                    price_map[lbl] = [price_map[lbl]]
			
 
				+                                if isinstance(price_map.get(lbl), list):
			
 
				+                                    price_map[lbl].append({kk: v for kk, v in it.items() if kk != "label"})
			
 
				+                                else:
			
 
				+                                    price_map[lbl] = {kk: v for kk, v in it.items() if kk != "label"}
			
 
				+                            return price_map
			
 
				+
			
 
				+                        result["prices"] = _build_price_map(items)
			
 
				+                    except Exception as e:
			
 
				+                        import traceback as _tb
			
 
				+                        print(f"[ERROR] 价格模块异常: {e}\n{_tb.format_exc()}")
			
 
				+                        result["prices"] = {}
			
 
				+                        result["price_error"] = str(e)
			
 
				+
			
 
				                 browser.close()
			
 
				 
			
 
				-    # ── price 模块（原始脚本，独立浏览器） ──────────────────────────────────────
			
 
				-    if "price" in modules:
			
 
				-        print(f"[INFO] 运行价格模块...")
			
 
				-        price_result = scrape_model_price(
			
 
				-            url,
			
 
				-            headless=headless,
			
 
				-            timeout=timeout,
			
 
				-            executable_path=executable_path,
			
 
				-            api_key=api_key,
			
 
				-        )
			
 
				-        result["prices"] = price_result.get("prices", {})
			
 
				-        if price_result.get("error"):
			
 
				-            result["price_error"] = price_result["error"]
			
 
				+    # ── price 模块回退：若 shared_modules 为空（不含 info/rate/tool），独立启动浏览器 ──
			
 
				+    if "price" in modules and not shared_modules:
			
 
				+        print(f"[INFO] 运行价格模块（独立浏览器）...")
			
 
				+        try:
			
 
				+            price_result = scrape_model_price(
			
 
				+                url,
			
 
				+                headless=headless,
			
 
				+                timeout=timeout,
			
 
				+                executable_path=executable_path,
			
 
				+                api_key=api_key,
			
 
				+                cookies_str=aliyun_cookies_str,
			
 
				+            )
			
 
				+            result["prices"] = price_result.get("prices", {})
			
 
				+            if price_result.get("error"):
			
 
				+                result["price_error"] = price_result["error"]
			
 
				+                print(f"[WARN] 价格模块错误: {price_result['error']}")
			
 
				+        except Exception as e:
			
 
				+            import traceback as _tb
			
 
				+            print(f"[ERROR] 价格模块异常: {e}\n{_tb.format_exc()}")
			
 
				+            result["prices"] = {}
			
 
				+            result["price_error"] = str(e)
			
 
				 
			
 
				     return result
			
 
				 
			
--- a/backend/crawl/scrape_aliyun_models.py
+++ b/backend/crawl/scrape_aliyun_models.py
@@ -628,7 +628,7 @@ def extract_price_items_global(html: str) -> List[Dict]:
 
				     return parse_prices_from_text(ancestor.get_text(separator="\n"))
			
 
				 
			
 
				 
			
 
				-def scrape_model_price(url: str, headless: bool = True, timeout: int = 20000, executable_path: Optional[str] = None, api_key: Optional[str] = None) -> Dict:
			
 
				+def scrape_model_price(url: str, headless: bool = True, timeout: int = 20000, executable_path: Optional[str] = None, api_key: Optional[str] = None, cookies_str: Optional[str] = None) -> Dict:
			
 
				     result = {"url": url, "error": None, "items": []}
			
 
				 
			
 
				     with sync_playwright() as p:
			
@@ -648,6 +648,26 @@ def scrape_model_price(url: str, headless: bool = True, timeout: int = 20000, ex
 
				         if api_key:
			
 
				             context_kwargs["extra_http_headers"] = {"Authorization": f"Bearer {api_key}"}
			
 
				         context = browser.new_context(**context_kwargs)
			
 
				+
			
 
				+        # 注入登录 Cookie（避免被重定向到登录/免费试用页）
			
 
				+        _cookies_str = cookies_str or os.environ.get("ALIYUN_COOKIES", "").strip()
			
 
				+        if _cookies_str:
			
 
				+            cookies = []
			
 
				+            for part in _cookies_str.split(";"):
			
 
				+                part = part.strip()
			
 
				+                if not part or "=" not in part:
			
 
				+                    continue
			
 
				+                name, _, value = part.partition("=")
			
 
				+                cookies.append({
			
 
				+                    "name": name.strip(),
			
 
				+                    "value": value.strip(),
			
 
				+                    "domain": ".aliyun.com",
			
 
				+                    "path": "/",
			
 
				+                })
			
 
				+            if cookies:
			
 
				+                context.add_cookies(cookies)
			
 
				+                print(f"[INFO][price] 已注入 {len(cookies)} 个 Cookie")
			
 
				+
			
 
				         page = context.new_page()
			
 
				 
			
 
				         network_hits = []
			
@@ -680,7 +700,7 @@ def scrape_model_price(url: str, headless: bool = True, timeout: int = 20000, ex
 
				         page.on("console", _on_console)
			
 
				         page.on("response", _on_response)
			
 
				         try:
			
 
				-            page.goto(url, wait_until="networkidle", timeout=timeout)
			
 
				+            page.goto(url, wait_until="domcontentloaded", timeout=timeout)
			
 
				         except PlaywrightTimeoutError:
			
 
				             try:
			
 
				                 page.goto(url, wait_until="load", timeout=timeout)
			
@@ -689,6 +709,11 @@ def scrape_model_price(url: str, headless: bool = True, timeout: int = 20000, ex
 
				                 browser.close()
			
 
				                 return result
			
 
				 
			
 
				+        try:
			
 
				+            page.wait_for_load_state("networkidle", timeout=20000)
			
 
				+        except PlaywrightTimeoutError:
			
 
				+            pass
			
 
				+
			
 
				         try:
			
 
				             page.wait_for_selector("text=模型价格", timeout=8000)
			
 
				         except PlaywrightTimeoutError:
			
--- a/backend/crawl/scrape_model_info.py
+++ b/backend/crawl/scrape_model_info.py
@@ -68,7 +68,9 @@ ALL_FEATURES_ORDERED: List[str] = [
 
				 
			
 
				 # URL fragments that identify the API responses to intercept.
				 # The alternatives are joined with "|" and matched case-insensitively.
				 API_URL_RE = re.compile(
				     "|".join(
				         (
				             "listFoundationModels",
				             "listRecommendedModels",
				             "listFeaturedModels",
				             "getModelDetail",
				             "modelCenter",
				             "BroadScopeAspnGateway",
				             r"bailian-cs\.console\.aliyun\.com/data/api",
				         )
				     ),
				     re.I,
				 )
			
 
				 
			
@@ -115,29 +117,41 @@ def _merge_with_items(obj: Dict) -> Dict:
 
				 def _find_model_in_json(data: Any, target: str) -> Optional[Dict]:
				     """Recursively search JSON-like data for the model object matching ``target``.

				     A leading ``group-`` prefix is stripped from ``target`` and from each
				     candidate's ``model`` field, and the comparison is case-insensitive.

				     Match priority, highest first:
				       1. The ``model`` value equals the target exactly.
				       2. The target is a dash-delimited prefix of the ``model`` value
				          (target ``glm-5.1`` matches model ``glm-5.1-0419``).
				       3. The ``model`` value is a dash-delimited prefix of the target
				          (model ``qwen3-max`` matches target ``qwen3-max-0919``).

				     Within one priority level the first candidate in traversal order wins.
				     Each recorded candidate is passed through ``_merge_with_items``, which
				     (per the original docstring) fills missing fields from ``items[0]``.

				     Args:
				         data: Parsed JSON value (dict, list or scalar) to search.
				         target: Model identifier to look for.

				     Returns:
				         The best-matching model dict, or ``None`` when nothing matches.
				     """
				     clean_target = re.sub(r"^group-", "", target.lower())

				     exact_match: Optional[Dict] = None
				     target_prefix_match: Optional[Dict] = None  # rule 2 candidate
				     model_prefix_match: Optional[Dict] = None  # rule 3 candidate

				     def _scan(node: Any) -> None:
				         nonlocal exact_match, target_prefix_match, model_prefix_match
				         if exact_match is not None:
				             # Nothing can outrank an exact match; stop walking the tree.
				             return
				         if isinstance(node, dict):
				             raw_model = node.get("model")
				             # A null "model" is treated as absent rather than as the
				             # literal string "none".
				             model_val = "" if raw_model is None else str(raw_model).lower()
				             model_val = re.sub(r"^group-", "", model_val)
				             if model_val:
				                 if model_val == clean_target:
				                     exact_match = _merge_with_items(node)
				                     return
				                 if model_val.startswith(clean_target + "-"):
				                     if target_prefix_match is None:
				                         target_prefix_match = _merge_with_items(node)
				                 elif clean_target.startswith(model_val + "-"):
				                     if model_prefix_match is None:
				                         model_prefix_match = _merge_with_items(node)
				             # A prefix candidate does not end the search: a nested node
				             # may still hold an exact match.
				             for child in node.values():
				                 _scan(child)
				         elif isinstance(node, list):
				             for child in node:
				                 _scan(child)

				     _scan(data)
				     return exact_match or target_prefix_match or model_prefix_match
			
 
				 
			
 
				 
			
 
				 def parse_model_info(model_obj: Dict) -> Dict:
			
--- a/backend/main.py
+++ b/backend/main.py
@@ -9,4 +9,10 @@ logging.basicConfig(
 
				 )
			
 
				 
			
 
				 if __name__ == "__main__":
				     # Script entry point: serve the ASGI app with auto-reload enabled,
				     # watching both the "app" and "crawl" directories for changes.
				     watched_dirs = ["app", "crawl"]
				     uvicorn.run(
				         "app.main:app",
				         host=settings.host,
				         port=settings.port,
				         reload=True,
				         reload_dirs=watched_dirs,
				     )
			
--- a/frontend/.env
+++ b/frontend/.env
@@ -1,4 +1,5 @@
 
				 #测试
			
 
				 VITE_API_BASE_URL=http://localhost:8000
			
 
				 #生产
			
 
				-# VITE_API_BASE_URL=https://crawler-api.aitoolcore.com
			
 
				+# VITE_API_BASE_URL=https://crawler-api.aitoolcore.com
			
 
				+# VITE_API_BASE_URL=http://8.137.98.23:8000
			
--- a/frontend/src/pages/Scraper.css
+++ b/frontend/src/pages/Scraper.css
@@ -515,6 +515,16 @@
 
				   margin-bottom: 6px;
			
 
				 }
			
 
				 
			
 
				+.info-error {
			
 
				+  font-size: 11px;
			
 
				+  color: var(--neon-red);
			
 
				+  background: rgba(255, 68, 102, 0.08);
			
 
				+  border: 1px solid rgba(255, 68, 102, 0.3);
			
 
				+  border-radius: 3px;
			
 
				+  padding: 4px 8px;
			
 
				+  margin-bottom: 6px;
			
 
				+}
			
 
				+
			
 
				 .tag-row {
			
 
				   display: flex;
			
 
				   flex-wrap: wrap;
			
--- a/frontend/src/pages/Scraper.tsx
+++ b/frontend/src/pages/Scraper.tsx
@@ -17,6 +17,9 @@ function PriceCard({ result }: { result: NonNullable<ScrapeJobDetail['results']>
 
				       {model_info && (
			
 
				         <div className="info-section">
			
 
				           <div className="info-section-title">模型信息</div>
			
 
				+          {model_info.error && (
			
 
				+            <div className="info-error">⚠ {model_info.error}</div>
			
 
				+          )}
			
 
				           {model_info.display_tags && model_info.display_tags.length > 0 && (
			
 
				             <div className="tag-row">
			
 
				               {model_info.display_tags.map(t => <span key={t} className="tag">{t}</span>)}
			
@@ -246,7 +249,7 @@ export function Scraper() {
 
				                 <input type="checkbox" checked={selected.has(m.id)} onChange={() => toggleSelect(m.id)} />
			
 
				                 <span className="model-name" title={m.url}>{m.name}</span>
			
 
				               </label>
			
 
				-              {m.api_key && <span className="model-key-badge" title="已配置 API Key">🔑</span>}
			
 
				+              {m.api_key_id && <span className="model-key-badge" title="已配置 API Key">🔑</span>}
			
 
				             </li>
			
 
				           ))}
			
 
				         </ul>