2 месяца назад · 4f2fb38a9e
--- a/backend/.env
+++ b/backend/.env
@@ -9,10 +9,11 @@ DB_NAME=crawl_test
 
															 ALLOWED_ORIGINS=http://localhost:5173
														
 
															 # ALLOWED_ORIGINS=https://crawler.aitoolcore.com
														
 
															+# ALLOWED_ORIGINS=http://8.137.98.23:5173
														
 
															 GEOIP_DB_PATH=./GeoLite2-City.mmdb
														
 
															 #本地
														
 
															 PLAYWRIGHT_EXECUTABLE=D:\playwright-browsers\chromium-1208\chrome-win64\chrome.exe
														
 
															-# #生产
														
 
															+#生产
														
 
															 # PLAYWRIGHT_EXECUTABLE=/www/wwwroot/playwright/chromium-1045/chrome-linux/chrome
														
 
															 PLAYWRIGHT_HEADLESS=true
														
--- a/backend/app/services/scraper.py
+++ b/backend/app/services/scraper.py
@@ -21,6 +21,13 @@ for _p in (_backend_root, _crawl_dir):
 
															 from main import scrape_all  # noqa: E402  (backend/crawl/main.py)
														
 
															+# 启动时打印当前 INFO_API_RE，用于确认加载的是最新代码
														
 
															+import main as _crawl_main
														
 
															+import logging as _logging
														
 
															+_logging.getLogger(__name__).info(
														
 
															+    f"[scraper] INFO_API_RE = {_crawl_main.INFO_API_RE.pattern[:60]}..."
														
 
															+)
														
 
															+
														
 
															 class ScraperService:
														
 
															     """Manages the lifecycle of a scrape job."""
														
--- a/backend/crawl/main.py
+++ b/backend/crawl/main.py
@@ -51,7 +51,7 @@ from scrape_model_icon import _extract_icon_from_page
 
															 def _navigate(page, url: str, timeout: int) -> bool:
														
 
															     """导航到 URL，返回是否成功。"""
														
 
															     try:
														
 
															-        page.goto(url, wait_until="networkidle", timeout=timeout)
														
 
															+        page.goto(url, wait_until="domcontentloaded", timeout=timeout)
														
 
															         return True
														
 
															     except PlaywrightTimeoutError:
														
 
															         try:
														
@@ -64,7 +64,7 @@ def _navigate(page, url: str, timeout: int) -> bool:
 
															 def _wait_for_content(page) -> None:
														
 
															     """等待页面核心内容渲染完成。"""
														
 
															-    for sel in ["text=模型价格", "text=模型介绍", "text=模型能力"]:
														
 
															+    for sel in ["text=模型价格", "text=模型介绍", "text=模型能力", "text=模型限流"]:
														
 
															         try:
														
 
															             page.wait_for_selector(sel, timeout=6000)
														
 
															             break
														
@@ -81,6 +81,27 @@ def _wait_for_content(page) -> None:
 
															         pass
														
 
															+def _parse_cookies_env(cookies_str: str, domain: str = ".aliyun.com") -> List[Dict]:
														
 
															+    """
														
 
															+    将 Cookie 字符串（浏览器复制的 name=value; name2=value2 格式）
														
 
															+    解析为 Playwright set_cookies 所需的列表格式。
														
 
															+    """
														
 
															+    cookies = []
														
 
															+    for part in cookies_str.split(";"):
														
 
															+        part = part.strip()
														
 
															+        if not part:
														
 
															+            continue
														
 
															+        if "=" in part:
														
 
															+            name, _, value = part.partition("=")
														
 
															+            cookies.append({
														
 
															+                "name": name.strip(),
														
 
															+                "value": value.strip(),
														
 
															+                "domain": domain,
														
 
															+                "path": "/",
														
 
															+            })
														
 
															+    return cookies
														
 
															+
														
 
															+
														
 
															 def scrape_all(
														
 
															     url: str,
														
 
															     headless: bool = True,
														
@@ -109,6 +130,9 @@ def scrape_all(
 
															     # 其余模块共享一个浏览器实例
														
 
															     shared_modules = [m for m in modules if m != "price"]
														
 
															+    # 从环境变量读取登录 Cookie
														
 
															+    aliyun_cookies_str = os.environ.get("ALIYUN_COOKIES", "").strip()
														
 
															+
														
 
															     # ── 共享浏览器：info / rate / tool ──────────────────────────────────────────
														
 
															     if shared_modules:
														
 
															         api_data: List[Dict] = []
														
@@ -130,28 +154,51 @@ def scrape_all(
 
															             context_kwargs: Dict = {}
														
 
															             if api_key:
														
 
															                 context_kwargs["extra_http_headers"] = {"Authorization": f"Bearer {api_key}"}
														
 
															-            page = browser.new_context(**context_kwargs).new_page()
														
 
															+            context = browser.new_context(**context_kwargs)
														
 
															-            # 拦截 API 响应
														
 
															-            def on_response(resp):
														
 
															+            # 注入登录 Cookie（避免被重定向到登录/免费试用页）
														
 
															+            if aliyun_cookies_str:
														
 
															+                cookies = _parse_cookies_env(aliyun_cookies_str)
														
 
															+                if cookies:
														
 
															+                    context.add_cookies(cookies)
														
 
															+                    print(f"[INFO] 已注入 {len(cookies)} 个 Cookie")
														
 
															+
														
 
															+            page = context.new_page()
														
 
															+
														
 
															+            # 只拦截匹配 INFO_API_RE 的 JSON API 请求，其余直接放行
														
 
															+            # 避免对图片/日志等请求调用 route.fetch() 导致 DNS 失败崩溃
														
 
															+            import json as _json
														
 
															+
														
 
															+            def handle_api_route(route, request):
														
 
															                 try:
														
 
															-                    if "application/json" not in resp.headers.get("content-type", ""):
														
 
															-                        return
														
 
															-                    if not INFO_API_RE.search(resp.url):
														
 
															-                        return
														
 
															+                    resp = route.fetch()
														
 
															                     try:
														
 
															-                        api_data.append(resp.json())
														
 
															+                        ct = resp.headers.get("content-type", "")
														
 
															+                        if "application/json" in ct:
														
 
															+                            api_data.append(_json.loads(resp.body()))
														
 
															+                    except Exception:
														
 
															+                        pass
														
 
															+                    route.fulfill(response=resp)
														
 
															+                except Exception as e:
														
 
															+                    try:
														
 
															+                        route.continue_()
														
 
															                     except Exception:
														
 
															                         pass
														
 
															-                except Exception:
														
 
															-                    pass
														
 
															-            page.on("response", on_response)
														
 
															+            # 只对匹配 API 的 URL 注册拦截，其余请求不拦截（直接走浏览器默认行为）
														
 
															+            page.route(
														
 
															+                lambda url: bool(INFO_API_RE.search(url)),
														
 
															+                handle_api_route,
														
 
															+            )
														
 
															             if not _navigate(page, url, timeout):
														
 
															                 result["error"] = "导航失败"
														
 
															                 browser.close()
														
 
															             else:
														
 
															+                try:
														
 
															+                    page.wait_for_load_state("networkidle", timeout=20000)
														
 
															+                except PlaywrightTimeoutError:
														
 
															+                    pass
														
 
															                 _wait_for_content(page)
														
 
															                 # 从 API 找模型对象
														
@@ -160,7 +207,6 @@ def scrape_all(
 
															                     found = _find_model_in_json(body, target)
														
 
															                     if found:
														
 
															                         model_obj = found
														
 
															-                        print(f"[INFO] API 找到模型: {found.get('model', found.get('name', target))}")
														
 
															                         break
														
 
															                 if not model_obj:
														
@@ -189,21 +235,101 @@ def scrape_all(
 
															                     icon = _extract_icon_from_page(page)
														
 
															                     result["icon"] = icon.get("data") if icon.get("type") != "none" else None
														
 
															+                # ── price 模块（复用共享浏览器） ──
														
 
															+                if "price" in modules:
														
 
															+                    try:
														
 
															+                        from scrape_aliyun_models import (
														
 
															+                            extract_price_items_from_html,
														
 
															+                            extract_price_block_html,
														
 
															+                            parse_prices_from_text,
														
 
															+                            _ensure_tiered_pricing,
														
 
															+                            _get_tier_options,
														
 
															+                            _select_tier_option,
														
 
															+                            _normalize_tier_option,
														
 
															+                        )
														
 
															+                        import time as _time
														
 
															+
														
 
															+                        _ensure_tiered_pricing(page)
														
 
															+                        tier_options = _get_tier_options(page)
														
 
															+                        tiered_items = []
														
 
															+                        if tier_options:
														
 
															+                            for opt in tier_options:
														
 
															+                                if not _select_tier_option(page, opt):
														
 
															+                                    continue
														
 
															+                                html = page.content()
														
 
															+                                try:
														
 
															+                                    tier_items = extract_price_items_from_html(html)
														
 
															+                                except Exception:
														
 
															+                                    tier_items = []
														
 
															+                                for it in tier_items:
														
 
															+                                    it["tier"] = opt
														
 
															+                                tiered_items.extend(tier_items)
														
 
															+
														
 
															+                        if tiered_items:
														
 
															+                            items = tiered_items
														
 
															+                        else:
														
 
															+                            html = page.content()
														
 
															+                            items = extract_price_items_from_html(html)
														
 
															+                            if not items:
														
 
															+                                text_block = extract_price_block_html(html)
														
 
															+                                items = parse_prices_from_text(text_block) if text_block else []
														
 
															+
														
 
															+                        # 构建 price_map（复用 scrape_model_price 里的逻辑）
														
 
															+                        def _build_price_map(parsed_items):
														
 
															+                            price_map = {}
														
 
															+                            for it in parsed_items:
														
 
															+                                if isinstance(it, dict) and it.get("tiers") and isinstance(it.get("tiers"), dict):
														
 
															+                                    for tier_key, tier_val in it["tiers"].items():
														
 
															+                                        k = _normalize_tier_option(tier_key)
														
 
															+                                        price_map.setdefault(k, {})
														
 
															+                                        sub_label = tier_val.get("label") or tier_val.get("raw") or k
														
 
															+                                        price_map[k][sub_label] = {kk: v for kk, v in tier_val.items() if kk not in ("tier", "tiers", "label")}
														
 
															+                                    continue
														
 
															+                                if it.get("tier"):
														
 
															+                                    tk = _normalize_tier_option(it.get("tier"))
														
 
															+                                    price_map.setdefault(tk, {})
														
 
															+                                    sub_label = it.get("label") or it.get("raw") or tk
														
 
															+                                    price_map[tk][sub_label] = {kk: v for kk, v in it.items() if kk not in ("tier", "label")}
														
 
															+                                    continue
														
 
															+                                lbl = it.get("label") or it.get("raw") or "price"
														
 
															+                                if lbl in price_map and not isinstance(price_map[lbl], list):
														
 
															+                                    price_map[lbl] = [price_map[lbl]]
														
 
															+                                if isinstance(price_map.get(lbl), list):
														
 
															+                                    price_map[lbl].append({kk: v for kk, v in it.items() if kk != "label"})
														
 
															+                                else:
														
 
															+                                    price_map[lbl] = {kk: v for kk, v in it.items() if kk != "label"}
														
 
															+                            return price_map
														
 
															+
														
 
															+                        result["prices"] = _build_price_map(items)
														
 
															+                    except Exception as e:
														
 
															+                        import traceback as _tb
														
 
															+                        print(f"[ERROR] 价格模块异常: {e}\n{_tb.format_exc()}")
														
 
															+                        result["prices"] = {}
														
 
															+                        result["price_error"] = str(e)
														
 
															+
														
 
															                 browser.close()
														
 
															-    # ── price 模块（原始脚本，独立浏览器） ──────────────────────────────────────
														
 
															-    if "price" in modules:
														
 
															-        print(f"[INFO] 运行价格模块...")
														
 
															-        price_result = scrape_model_price(
														
 
															-            url,
														
 
															-            headless=headless,
														
 
															-            timeout=timeout,
														
 
															-            executable_path=executable_path,
														
 
															-            api_key=api_key,
														
 
															-        )
														
 
															-        result["prices"] = price_result.get("prices", {})
														
 
															-        if price_result.get("error"):
														
 
															-            result["price_error"] = price_result["error"]
														
 
															+    # ── price 模块回退：若 shared_modules 为空（不含 info/rate/tool），独立启动浏览器 ──
														
 
															+    if "price" in modules and not shared_modules:
														
 
															+        print(f"[INFO] 运行价格模块（独立浏览器）...")
														
 
															+        try:
														
 
															+            price_result = scrape_model_price(
														
 
															+                url,
														
 
															+                headless=headless,
														
 
															+                timeout=timeout,
														
 
															+                executable_path=executable_path,
														
 
															+                api_key=api_key,
														
 
															+                cookies_str=aliyun_cookies_str,
														
 
															+            )
														
 
															+            result["prices"] = price_result.get("prices", {})
														
 
															+            if price_result.get("error"):
														
 
															+                result["price_error"] = price_result["error"]
														
 
															+                print(f"[WARN] 价格模块错误: {price_result['error']}")
														
 
															+        except Exception as e:
														
 
															+            import traceback as _tb
														
 
															+            print(f"[ERROR] 价格模块异常: {e}\n{_tb.format_exc()}")
														
 
															+            result["prices"] = {}
														
 
															+            result["price_error"] = str(e)
														
 
															     return result
														
--- a/backend/crawl/scrape_aliyun_models.py
+++ b/backend/crawl/scrape_aliyun_models.py
@@ -628,7 +628,7 @@ def extract_price_items_global(html: str) -> List[Dict]:
 
															     return parse_prices_from_text(ancestor.get_text(separator="\n"))
														
 
															-def scrape_model_price(url: str, headless: bool = True, timeout: int = 20000, executable_path: Optional[str] = None, api_key: Optional[str] = None) -> Dict:
														
 
															+def scrape_model_price(url: str, headless: bool = True, timeout: int = 20000, executable_path: Optional[str] = None, api_key: Optional[str] = None, cookies_str: Optional[str] = None) -> Dict:
														
 
															     result = {"url": url, "error": None, "items": []}
														
 
															     with sync_playwright() as p:
														
@@ -648,6 +648,26 @@ def scrape_model_price(url: str, headless: bool = True, timeout: int = 20000, ex
 
															         if api_key:
														
 
															             context_kwargs["extra_http_headers"] = {"Authorization": f"Bearer {api_key}"}
														
 
															         context = browser.new_context(**context_kwargs)
														
 
															+
														
 
															+        # 注入登录 Cookie（避免被重定向到登录/免费试用页）
														
 
															+        _cookies_str = cookies_str or os.environ.get("ALIYUN_COOKIES", "").strip()
														
 
															+        if _cookies_str:
														
 
															+            cookies = []
														
 
															+            for part in _cookies_str.split(";"):
														
 
															+                part = part.strip()
														
 
															+                if not part or "=" not in part:
														
 
															+                    continue
														
 
															+                name, _, value = part.partition("=")
														
 
															+                cookies.append({
														
 
															+                    "name": name.strip(),
														
 
															+                    "value": value.strip(),
														
 
															+                    "domain": ".aliyun.com",
														
 
															+                    "path": "/",
														
 
															+                })
														
 
															+            if cookies:
														
 
															+                context.add_cookies(cookies)
														
 
															+                print(f"[INFO][price] 已注入 {len(cookies)} 个 Cookie")
														
 
															+
														
 
															         page = context.new_page()
														
 
															         network_hits = []
														
@@ -680,7 +700,7 @@ def scrape_model_price(url: str, headless: bool = True, timeout: int = 20000, ex
 
															         page.on("console", _on_console)
														
 
															         page.on("response", _on_response)
														
 
															         try:
														
 
															-            page.goto(url, wait_until="networkidle", timeout=timeout)
														
 
															+            page.goto(url, wait_until="domcontentloaded", timeout=timeout)
														
 
															         except PlaywrightTimeoutError:
														
 
															             try:
														
 
															                 page.goto(url, wait_until="load", timeout=timeout)
														
@@ -689,6 +709,11 @@ def scrape_model_price(url: str, headless: bool = True, timeout: int = 20000, ex
 
															                 browser.close()
														
 
															                 return result
														
 
															+        try:
														
 
															+            page.wait_for_load_state("networkidle", timeout=20000)
														
 
															+        except PlaywrightTimeoutError:
														
 
															+            pass
														
 
															+
														
 
															         try:
														
 
															             page.wait_for_selector("text=模型价格", timeout=8000)
														
 
															         except PlaywrightTimeoutError:
														
--- a/backend/crawl/scrape_model_info.py
+++ b/backend/crawl/scrape_model_info.py
@@ -68,7 +68,9 @@ ALL_FEATURES_ORDERED: List[str] = [
 
															 # 需要拦截的 API URL 关键词
														
 
															 API_URL_RE = re.compile(
														
 
															-    r"listFoundationModels|listRecommendedModels|listFeaturedModels|getModelDetail|modelCenter",
														
 
															+    r"listFoundationModels|listRecommendedModels|listFeaturedModels|getModelDetail|modelCenter"
														
 
															+    r"|BroadScopeAspnGateway"
														
 
															+    r"|bailian-cs\.console\.aliyun\.com/data/api",
														
 
															     re.I,
														
 
															 )
														
@@ -115,29 +117,41 @@ def _merge_with_items(obj: Dict) -> Dict:
 
															 def _find_model_in_json(data: Any, target: str) -> Optional[Dict]:
														
 
															     """
														
 
															     递归在 JSON 数据中查找与 target 匹配的模型对象。
														
 
															-    匹配规则：model 或 name 字段去掉 group- 前缀后与 target 完全相等（优先），
														
 
															-    或 target 是 model_val 的完整前缀（如 qwen3-max 匹配 qwen3-max-0919）。
														
 
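															+    Matching rules (case-insensitive; an exact match always wins, while rules 2 and 3 share one priority and the first hit in traversal order is kept):
														
 
															+      1. The model field, with any leading "group-" removed, equals target exactly.
														
 
															+      2. target followed by "-" is a prefix of model_val (target glm-5.1 matches model glm-5.1-0419).
														
 
															+      3. model_val followed by "-" is a prefix of target (model qwen3-max matches target qwen3-max-0919).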
														
 
															     找到后自动用 items[0] 补充缺失字段。
														
 
															     """
														
 
															     clean_target = re.sub(r"^group-", "", target.lower())
														
 
															-    if isinstance(data, dict):
														
 
															-        model_val = re.sub(r"^group-", "", str(data.get("model", "")).lower())
														
 
															-        name_val = str(data.get("name", "")).lower()
														
 
															-        # 精确匹配 model 字段
														
 
															-        is_match = (model_val == clean_target)
														
 
															-        if is_match and ("model" in data or "name" in data):
														
 
															-            return _merge_with_items(data)
														
 
															-        for v in data.values():
														
 
															-            found = _find_model_in_json(v, target)
														
 
															-            if found:
														
 
															-                return found
														
 
															-    elif isinstance(data, list):
														
 
															-        for item in data:
														
 
															-            found = _find_model_in_json(item, target)
														
 
															-            if found:
														
 
															-                return found
														
 
															-    return None
														
 
															+    # Single traversal: remember the first exact match and the first prefix match; the exact match is preferred on return.
														
 
															+    exact_match: Optional[Dict] = None
														
 
															+    prefix_match: Optional[Dict] = None
														
 
															+
														
 
															+    def _scan(node: Any) -> None:
														
 
															+        nonlocal exact_match, prefix_match
														
 
															+        if isinstance(node, dict):
														
 
															+            model_val = re.sub(r"^group-", "", str(node.get("model", "")).lower())
														
 
															+            if model_val and ("model" in node or "name" in node):
														
 
															+                if model_val == clean_target:
														
 
															+                    if exact_match is None:
														
 
															+                        exact_match = _merge_with_items(node)
														
 
															+                    return
														
 
															+                # 前缀匹配：target 是 model_val 的前缀（glm-5.1 → glm-5.1-0419）
														
 
															+                # 或 model_val 是 target 的前缀（qwen3-max → qwen3-max-0919）
														
 
															+                if (model_val.startswith(clean_target + "-") or
														
 
															+                        clean_target.startswith(model_val + "-")):
														
 
															+                    if prefix_match is None:
														
 
															+                        prefix_match = _merge_with_items(node)
														
 
															+            for v in node.values():
														
 
															+                _scan(v)
														
 
															+        elif isinstance(node, list):
														
 
															+            for item in node:
														
 
															+                _scan(item)
														
 
															+
														
 
															+    _scan(data)
														
 
															+    return exact_match or prefix_match
														
 
															 def parse_model_info(model_obj: Dict) -> Dict:
														
--- a/backend/main.py
+++ b/backend/main.py
@@ -9,4 +9,10 @@ logging.basicConfig(
 
															 )
														
 
															 if __name__ == "__main__":
														
 
															-    uvicorn.run("app.main:app", host=settings.host, port=settings.port, reload=True)
														
 
															+    uvicorn.run(
														
 
															+        "app.main:app",
														
 
															+        host=settings.host,
														
 
															+        port=settings.port,
														
 
															+        reload=True,
														
 
															+        reload_dirs=["app", "crawl"],  # 同时监听 crawl 目录
														
 
															+    )
														
--- a/frontend/.env
+++ b/frontend/.env
@@ -1,4 +1,5 @@
 
															 #测试
														
 
															 VITE_API_BASE_URL=http://localhost:8000
														
 
															 #生产
														
 
															-# VITE_API_BASE_URL=https://crawler-api.aitoolcore.com
														
 
															+# VITE_API_BASE_URL=https://crawler-api.aitoolcore.com
														
 
															+# VITE_API_BASE_URL=http://8.137.98.23:8000
														
--- a/frontend/src/pages/Scraper.css
+++ b/frontend/src/pages/Scraper.css
@@ -515,6 +515,16 @@
 
															   margin-bottom: 6px;
														
 
															 }
														
 
															+.info-error {
														
 
															+  font-size: 11px;
														
 
															+  color: var(--neon-red);
														
 
															+  background: rgba(255, 68, 102, 0.08);
														
 
															+  border: 1px solid rgba(255, 68, 102, 0.3);
														
 
															+  border-radius: 3px;
														
 
															+  padding: 4px 8px;
														
 
															+  margin-bottom: 6px;
														
 
															+}
														
 
															+
														
 
															 .tag-row {
														
 
															   display: flex;
														
 
															   flex-wrap: wrap;
														
--- a/frontend/src/pages/Scraper.tsx
+++ b/frontend/src/pages/Scraper.tsx
@@ -17,6 +17,9 @@ function PriceCard({ result }: { result: NonNullable<ScrapeJobDetail['results']>
 
															       {model_info && (
														
 
															         <div className="info-section">
														
 
															           <div className="info-section-title">模型信息</div>
														
 
															+          {model_info.error && (
														
 
															+            <div className="info-error">⚠ {model_info.error}</div>
														
 
															+          )}
														
 
															           {model_info.display_tags && model_info.display_tags.length > 0 && (
														
 
															             <div className="tag-row">
														
 
															               {model_info.display_tags.map(t => <span key={t} className="tag">{t}</span>)}
														
@@ -246,7 +249,7 @@ export function Scraper() {
 
															                 <input type="checkbox" checked={selected.has(m.id)} onChange={() => toggleSelect(m.id)} />
														
 
															                 <span className="model-name" title={m.url}>{m.name}</span>
														
 
															               </label>
														
 
															-              {m.api_key && <span className="model-key-badge" title="已配置 API Key">🔑</span>}
														
 
															+              {m.api_key_id && <span className="model-key-badge" title="已配置 API Key">🔑</span>}
														
 
															             </li>
														
 
															           ))}
														
 
															         </ul>