2 Commity e87cf14e5b ... afe5d4c91b

Autor SHA1 Wiadomość Data
  lxylxy123321 afe5d4c91b env修改 1 tydzień temu
  lxylxy123321 20a31a7991 新增版本号与playwright的win和linux兼容 1 tydzień temu

+ 8 - 0
backend/.env

@@ -8,6 +8,14 @@ DB_PASSWORD=wsNbzdnmPnpwCj56
 DB_NAME=crawl
 
 ALLOWED_ORIGINS=http://localhost:5173
+# ALLOWED_ORIGINS=https://crawler.aitoolcore.com
 GEOIP_DB_PATH=./GeoLite2-City.mmdb
+#本地
 PLAYWRIGHT_EXECUTABLE=D:\playwright-browsers\chromium-1208\chrome-win64\chrome.exe
+#生产
+# PLAYWRIGHT_EXECUTABLE=/www/wwwroot/playwright/playwright-browsers/chromium-1208/chrome-win64/chrome.exe
 PLAYWRIGHT_HEADLESS=true
+
+# Linux 生产环境 Chrome 额外启动参数(逗号分隔)
+# 解决 crashpad 崩溃问题:禁用 crash reporter,并指定 crash dumps 目录
+# PLAYWRIGHT_EXTRA_ARGS=--disable-crash-reporter,--crash-dumps-dir=/tmp

+ 5 - 0
backend/.env.example

@@ -10,3 +10,8 @@ DB_NAME=sentinel_lens
 ALLOWED_ORIGINS=http://localhost:5173
 GEOIP_DB_PATH=./GeoLite2-City.mmdb
 PLAYWRIGHT_EXECUTABLE=
+PLAYWRIGHT_HEADLESS=true
+
+# Linux 生产环境 Chrome 额外启动参数(逗号分隔)
+# 解决 crashpad 崩溃问题:禁用 crash reporter,并指定 crash dumps 目录
+# PLAYWRIGHT_EXTRA_ARGS=--disable-crash-reporter,--crash-dumps-dir=/tmp

+ 10 - 1
backend/app/main.py

@@ -16,8 +16,8 @@ from app.services.scheduler import start_scheduler, stop_scheduler
 @asynccontextmanager
 async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
     await init_pool()
-    # 建 users 表并初始化 admin
     pool = get_pool()
+    # 建 users 表并初始化 admin
     await pool.execute("""
         CREATE TABLE IF NOT EXISTS crawl.users (
             id           BIGSERIAL    PRIMARY KEY,
@@ -28,6 +28,15 @@ async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
     """)
     from app.routers.auth import ensure_admin_user
     await ensure_admin_user()
+    # 清理上次进程意外退出遗留的僵死任务
+    cleaned = await pool.execute(
+        """
+        UPDATE scrape_jobs SET status = 'failed', error = '服务重启,任务中断', updated_at = NOW()
+        WHERE status IN ('pending', 'running')
+        """
+    )
+    import logging
+    logging.getLogger(__name__).info(f"[startup] 清理僵死任务: {cleaned}")
     await start_scheduler()
     yield
     await stop_scheduler()

+ 55 - 45
backend/app/routers/public.py

@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 from datetime import datetime
-from typing import List, Optional
+from typing import List, Optional, Union
 from urllib.parse import urlparse
 
 import json
@@ -60,6 +60,7 @@ class ModelTypeItem(BaseModel):
 
 
 class PricesResponse(BaseModel):
+    version: int
     models: List[PublicPriceOut]
     parsed_prices: List[ParsedPriceItem]
     discounted_prices: List[DiscountedPriceItem]
@@ -67,6 +68,11 @@ class PricesResponse(BaseModel):
     discount: float = 1.0
 
 
+class UpToDateResponse(BaseModel):
+    up_to_date: bool = True
+    version: int
+
+
 def _extract_domain(referer: Optional[str]) -> Optional[str]:
     if not referer:
         return None
@@ -76,20 +82,30 @@ def _extract_domain(referer: Optional[str]) -> Optional[str]:
         return None
 
 
-@router.get("/prices", response_model=PricesResponse)
-async def get_public_prices(request: Request, url: Optional[str] = None) -> PricesResponse:
+@router.get("/prices", response_model=Union[PricesResponse, UpToDateResponse])
+async def get_public_prices(
+    request: Request,
+    url: Optional[str] = None,
+) -> Union[PricesResponse, UpToDateResponse]:
     pool = get_pool()
 
+    # referer 必须提供
+    referer = request.headers.get("referer") or request.headers.get("origin")
+    if not referer:
+        raise HTTPException(status_code=400, detail="Missing Referer header")
+
+    # version 从 Header 读取,默认 0(首次请求)
+    try:
+        version = int(request.headers.get("version", "0") or "0")
+    except ValueError:
+        version = 0
+
     # 记录调用来源
     ip = request.client.host if request.client else "unknown"
-    referer = request.headers.get("referer") or request.headers.get("origin")
     geo = geo_resolver.resolve(ip)
     try:
         await pool.execute(
-            """
-            INSERT INTO price_api_logs (ip, referer, org, country, city)
-            VALUES ($1, $2, $3, $4, $5)
-            """,
+            "INSERT INTO price_api_logs (ip, referer, org, country, city) VALUES ($1, $2, $3, $4, $5)",
             ip, referer, geo.org, geo.country, geo.city,
         )
     except Exception:
@@ -99,9 +115,7 @@ async def get_public_prices(request: Request, url: Optional[str] = None) -> Pric
     caller_domain = _extract_domain(referer)
     discount_rate: Optional[float] = None
     if caller_domain:
-        row = await pool.fetchrow(
-            "SELECT discount FROM discounts WHERE domain = $1", caller_domain
-        )
+        row = await pool.fetchrow("SELECT discount FROM discounts WHERE domain = $1", caller_domain)
         if row:
             discount_rate = float(row["discount"])
 
@@ -110,39 +124,38 @@ async def get_public_prices(request: Request, url: Optional[str] = None) -> Pric
             return None
         return v if isinstance(v, (dict, list)) else json.loads(v)
 
+    # 读取全局版本号(0 表示尚未有任何快照)
+    ver_row = await pool.fetchrow("SELECT version FROM price_snapshot_version WHERE id = 1")
+    current_version: int = int(ver_row["version"]) if ver_row else 0
+
+    # version != 0 且与当前一致 → 无需更新(0 视为首次请求,强制返回数据)
+    if version != 0 and version == current_version:
+        return UpToDateResponse(up_to_date=True, version=current_version)
+
+    # 从 price_snapshot 读取数据
     if url is None:
         rows = await pool.fetch(
-            """
-            WITH latest_job AS (
-                SELECT id FROM scrape_jobs
-                WHERE status = 'done'
-                ORDER BY created_at DESC LIMIT 1
-            )
-            SELECT DISTINCT ON (r.url) r.url, r.model_name, r.prices,
-                   r.model_info, r.rate_limits, r.tool_prices, r.scraped_at
-            FROM scrape_results r
-            JOIN latest_job j ON r.job_id = j.id
-            ORDER BY r.url, r.scraped_at DESC
-            """
+            "SELECT url, model_name, prices, model_info, rate_limits, tool_prices, updated_at FROM price_snapshot ORDER BY url"
         )
     else:
         rows = await pool.fetch(
-            """
-            SELECT url, model_name, prices, model_info, rate_limits, tool_prices, scraped_at
-            FROM scrape_results
-            WHERE url = $1
-            ORDER BY scraped_at DESC LIMIT 1
-            """,
+            "SELECT url, model_name, prices, model_info, rate_limits, tool_prices, updated_at FROM price_snapshot WHERE url = $1",
             url,
         )
         if not rows:
-            raise HTTPException(status_code=404, detail="No scrape results found for the given URL")
+            raise HTTPException(status_code=404, detail="No price snapshot found for the given URL")
+
+    if not rows:
+        raise HTTPException(status_code=503, detail="Price snapshot not yet available")
+
+    # version != 0 且与当前一致 → 无需更新
+    if version != 0 and version == current_version:
+        return UpToDateResponse(up_to_date=True, version=current_version)
 
     def _extract_type(model_info: Optional[dict]) -> Optional[List[str]]:
         if not model_info:
             return None
         tags = model_info.get("display_tags", [])
-        # 只保留模型类型标签,排除系列名和能力标签(深度思考等)
         TYPE_TAGS = {"文本生成", "图像生成", "视觉理解", "音频理解", "视频理解", "视频生成", "向量表示", "语音识别", "语音合成"}
         result = [t for t in tags if t in TYPE_TAGS]
         return result if result else None
@@ -154,7 +167,7 @@ async def get_public_prices(request: Request, url: Optional[str] = None) -> Pric
         model_info=_j(r["model_info"]),
         rate_limits=_j(r["rate_limits"]),
         tool_prices=_j(r["tool_prices"]),
-        scraped_at=r["scraped_at"],
+        scraped_at=r["updated_at"],
     ) for r in rows]
 
     parsed_prices: List[ParsedPriceItem] = []
@@ -162,28 +175,25 @@ async def get_public_prices(request: Request, url: Optional[str] = None) -> Pric
 
     for r in rows:
         for item in parse_prices(_j(r["prices"]) or {}):
-            parsed_prices.append(ParsedPriceItem(
-                url=r["url"],
-                model_name=r["model_name"],
-                **item,
-            ))
-            # 折扣价:有折扣就乘,没有就原价(discount=None)
+            parsed_prices.append(ParsedPriceItem(url=r["url"], model_name=r["model_name"], **item))
             d_item = dict(item)
             if discount_rate is not None:
                 if d_item.get("input_price") is not None:
                     d_item["input_price"] = round(d_item["input_price"] * discount_rate, 6)
                 if d_item.get("output_price") is not None:
                     d_item["output_price"] = round(d_item["output_price"] * discount_rate, 6)
-            discounted_prices.append(DiscountedPriceItem(
-                url=r["url"],
-                model_name=r["model_name"],
-                discount=discount_rate,
-                **d_item,
-            ))
+            discounted_prices.append(DiscountedPriceItem(url=r["url"], model_name=r["model_name"], discount=discount_rate, **d_item))
 
     all_types = [
         ModelTypeItem(model_name=r["model_name"], type=_extract_type(_j(r["model_info"])) or [])
         for r in rows
     ]
 
-    return PricesResponse(models=models, parsed_prices=parsed_prices, discounted_prices=discounted_prices, types=all_types, discount=discount_rate if discount_rate is not None else 1.0)
+    return PricesResponse(
+        version=current_version,
+        models=models,
+        parsed_prices=parsed_prices,
+        discounted_prices=discounted_prices,
+        types=all_types,
+        discount=discount_rate if discount_rate is not None else 1.0,
+    )

+ 56 - 9
backend/app/services/scraper.py

@@ -38,6 +38,13 @@ class ScraperService:
             exec_path = os.environ.get("PLAYWRIGHT_EXECUTABLE") or None
             headless = os.environ.get("PLAYWRIGHT_HEADLESS", "true").lower() != "false"
 
+            def _norm(v) -> str:
+                if v is None:
+                    return "null"
+                return json.dumps(v if isinstance(v, (dict, list)) else json.loads(v), sort_keys=True)
+
+            any_changed = False
+
             for url in urls:
                 result: dict = await loop.run_in_executor(
                     None,
@@ -50,8 +57,8 @@ class ScraperService:
                     ),
                 )
 
-                prices     = result.get("prices") or {}
-                model_info = result.get("info") or {}
+                prices      = result.get("prices") or {}
+                model_info  = result.get("info") or {}
                 rate_limits = result.get("rate_limits") or {}
                 tool_prices = result.get("tool_call_prices") or []
 
@@ -68,16 +75,56 @@ class ScraperService:
                             (job_id, url, model_name, prices, model_info, rate_limits, tool_prices, raw_data)
                         VALUES ($1, $2, $3, $4::jsonb, $5::jsonb, $6::jsonb, $7::jsonb, $8::jsonb)
                         """,
-                        job_id,
-                        url,
-                        model_name,
-                        json.dumps(prices),
-                        json.dumps(model_info),
-                        json.dumps(rate_limits),
-                        json.dumps(tool_prices),
+                        job_id, url, model_name,
+                        json.dumps(prices), json.dumps(model_info),
+                        json.dumps(rate_limits), json.dumps(tool_prices),
                         json.dumps(result),
                     )
 
+                    # 对比旧快照,有变化才 upsert
+                    existing = await conn.fetchrow(
+                        "SELECT prices, model_info, rate_limits, tool_prices FROM price_snapshot WHERE url = $1",
+                        url,
+                    )
+                    data_changed = (
+                        existing is None
+                        or _norm(existing["prices"])      != _norm(prices)
+                        or _norm(existing["model_info"])  != _norm(model_info)
+                        or _norm(existing["rate_limits"]) != _norm(rate_limits)
+                        or _norm(existing["tool_prices"]) != _norm(tool_prices)
+                    )
+
+                    if data_changed:
+                        any_changed = True
+                        await conn.execute(
+                            """
+                            INSERT INTO price_snapshot
+                                (url, model_name, prices, model_info, rate_limits, tool_prices, updated_at)
+                            VALUES ($1, $2, $3::jsonb, $4::jsonb, $5::jsonb, $6::jsonb, NOW())
+                            ON CONFLICT (url) DO UPDATE SET
+                                model_name  = EXCLUDED.model_name,
+                                prices      = EXCLUDED.prices,
+                                model_info  = EXCLUDED.model_info,
+                                rate_limits = EXCLUDED.rate_limits,
+                                tool_prices = EXCLUDED.tool_prices,
+                                updated_at  = NOW()
+                            """,
+                            url, model_name,
+                            json.dumps(prices), json.dumps(model_info),
+                            json.dumps(rate_limits), json.dumps(tool_prices),
+                        )
+
+            # 本批次有任何数据变化,全局版本号 +1(从 1 开始)
+            if any_changed:
+                async with pool.acquire() as conn:
+                    await conn.execute(
+                        """
+                        UPDATE price_snapshot_version
+                        SET version = GREATEST(version + 1, 1), updated_at = NOW()
+                        WHERE id = 1
+                        """
+                    )
+
             async with pool.acquire() as conn:
                 await conn.execute(
                     "UPDATE scrape_jobs SET status = 'done', updated_at = NOW() WHERE id = $1",

+ 7 - 0
backend/crawl/main.py

@@ -111,6 +111,13 @@ def scrape_all(
             launch_kwargs: Dict = {"headless": headless}
             if executable_path:
                 launch_kwargs["executable_path"] = executable_path
+
+            # 额外 Chrome 启动参数(生产环境 Linux 可通过 PLAYWRIGHT_EXTRA_ARGS 注入)
+            extra_args_env = os.environ.get("PLAYWRIGHT_EXTRA_ARGS", "")
+            extra_args = [a.strip() for a in extra_args_env.split(",") if a.strip()]
+            if extra_args:
+                launch_kwargs["args"] = extra_args
+
             browser = p.chromium.launch(**launch_kwargs)
             page = browser.new_context().new_page()
 

+ 6 - 0
backend/crawl/scrape_aliyun_models.py

@@ -635,6 +635,12 @@ def scrape_model_price(url: str, headless: bool = True, timeout: int = 20000, ex
         launch_kwargs = {"headless": headless}
         if executable_path:
             launch_kwargs["executable_path"] = executable_path
+
+        extra_args_env = os.environ.get("PLAYWRIGHT_EXTRA_ARGS", "")
+        extra_args = [a.strip() for a in extra_args_env.split(",") if a.strip()]
+        if extra_args:
+            launch_kwargs["args"] = extra_args
+
         browser = p.chromium.launch(**launch_kwargs)
         context = browser.new_context()
         page = context.new_page()

+ 27 - 0
backend/migrations/010_price_snapshot.sql

@@ -0,0 +1,27 @@
+-- Migration 010: price_snapshot — canonical price data served by /prices API
+SET search_path TO crawl;
+
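+-- Global version counter: seeded at 0 (no snapshot yet) and incremented by 1 whenever a scrape changes any snapshot data, so the first published version is 1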
+CREATE TABLE IF NOT EXISTS price_snapshot_version (
+    id         INT          PRIMARY KEY DEFAULT 1,
+    version    BIGINT       NOT NULL DEFAULT 0,
+    updated_at TIMESTAMPTZ  NOT NULL DEFAULT NOW(),
+    CONSTRAINT single_row CHECK (id = 1)
+);
+
+INSERT INTO price_snapshot_version (id, version) VALUES (1, 0)
+ON CONFLICT (id) DO NOTHING;
+
+-- 每个模型 URL 一行,不再存 version(version 统一在 price_snapshot_version 管理)
+CREATE TABLE IF NOT EXISTS price_snapshot (
+    url         TEXT         PRIMARY KEY,
+    model_name  VARCHAR(200) NOT NULL,
+    prices      JSONB        NOT NULL DEFAULT '{}',
+    model_info  JSONB,
+    rate_limits JSONB,
+    tool_prices JSONB,
+    updated_at  TIMESTAMPTZ  NOT NULL DEFAULT NOW()
+);
+
+COMMENT ON TABLE price_snapshot IS
+    'One row per model URL. Global version is tracked in price_snapshot_version.';

+ 31 - 10
docs/api.md

@@ -34,17 +34,27 @@ Base URL: `http://localhost:8000`
 
 ### GET /api/public/prices
 
-获取最新一次爬取任务的所有模型价格数据。调用时会自动记录来源 IP 和 Referer,并根据域名匹配折扣。
+获取价格快照数据。调用时会自动记录来源 IP 和 Referer,并根据域名匹配折扣。
+
+> 数据来源为 `price_snapshot` 表,仅在爬取结果与上次不同时才更新,版本号同步自增。
+
+**Headers(Referer 必填,version 可选)**
+
+| Header  | 说明                                                                                 |
+|---------|--------------------------------------------------------------------------------------|
+| Referer | 调用方来源域名;未提供时回退到 Origin header,两者都缺少返回 400                       |
+| version | 客户端持有的版本号(整数)。不传或传 `0` 视为首次请求,强制返回数据;与服务端一致时返回 `up_to_date: true` |
 
 **Query Parameters**
 
-| 参数 | 类型   | 必填 | 说明                     |
-|------|--------|------|--------------------------|
-| url  | string | 否   | 指定单个模型页面 URL 过滤 |
+| 参数 | 类型   | 默认值 | 说明                      |
+|------|--------|--------|---------------------------|
+| url  | string | —      | 指定单个模型页面 URL 过滤  |
 
-**Response 200**
+**Response 200 — 有更新**
 ```json
 {
+  "version": 3,
   "models": [
     {
       "url": "https://bailian.console.aliyun.com/...",
@@ -56,7 +66,7 @@ Base URL: `http://localhost:8000`
         "description": "...",
         "input_modalities": ["Text"],
         "output_modalities": ["Text"],
-        "features": { "cache存储": true, "function calling": true, ... }
+        "features": { "cache存储": true, "function calling": true }
       },
       "rate_limits": { "RPM": "30000", "TPM": "10000000", "上下文长度": "1M" },
       "tool_prices": [{ "label": "联网搜索", "price": 0, "unit": "元/千次", "note": null }],
@@ -93,17 +103,28 @@ Base URL: `http://localhost:8000`
     }
   ],
   "types": [
-    { "model_name": "qwen-plus-latest", "type": ["文本生成"] },
-    { "model_name": "qwen-vl-plus", "type": ["视觉理解"] }
+    { "model_name": "qwen-plus-latest", "type": ["文本生成"] }
   ],
   "discount": 0.8
 }
 ```
 
+> `version` 为全局整数版本号,从 1 开始,每次爬取结果有变化时自增。  
 > `discount` 为调用方域名对应的折扣率,无折扣时为 `1.0`。
-> `discounted_prices` 中的价格已乘以折扣率。
 
-**Response 404** — 指定 url 无爬取结果
+**Response 200 — 无需更新(version 与服务端一致)**
+```json
+{
+  "up_to_date": true,
+  "version": 3
+}
+```
+
+**Response 400** — 缺少 Referer header(且未提供 Origin header)
+
+**Response 404** — 指定 url 无快照数据
+
+**Response 503** — 快照尚未生成(未执行过爬取)
 
 ---
 

+ 3 - 0
frontend/.env

@@ -1 +1,4 @@
+#测试
 VITE_API_BASE_URL=http://localhost:8000
+#生产
+# VITE_API_BASE_URL=https://crawler-api.aitoolcore.com

+ 1 - 1
frontend/src/pages/Map.tsx

@@ -41,7 +41,7 @@ export function MapPage() {
       fillOpacity: 0.8,
     };
 
-    const addLabels = (geojson: any, layer: L.GeoJSON, labelGroup: L.LayerGroup) => {
+    const addLabels = (geojson: any, _layer: L.GeoJSON, labelGroup: L.LayerGroup) => {
       geojson.features.forEach((feature: any) => {
         const name = feature.properties?.name;
         const center = feature.properties?.centroid || feature.properties?.center;

+ 11 - 0
frontend/src/pages/Scraper.tsx

@@ -134,7 +134,18 @@ export function Scraper() {
 
   const startPolling = (jobId: string) => {
     stopPolling();
+    const deadline = Date.now() + 10 * 60 * 1000; // 最多轮询 10 分钟
     pollRef.current = setInterval(async () => {
+      if (Date.now() > deadline) {
+        stopPolling();
+        setExpandedJobs(prev => {
+          const job = prev[jobId];
+          if (!job || job.status === 'done' || job.status === 'failed') return prev;
+          return { ...prev, [jobId]: { ...job, status: 'failed', error: '轮询超时,请刷新页面查看最新状态' } };
+        });
+        loadHistory();
+        return;
+      }
       try {
         const detail = await fetchScrapeJob(jobId);
         setExpandedJobs(prev => ({ ...prev, [jobId]: detail }));