# scrape_stats.py
  1. from __future__ import annotations
  2. from typing import List, Optional
  3. from datetime import date
  4. from fastapi import APIRouter
  5. from pydantic import BaseModel
  6. from app.db import get_pool
  7. router = APIRouter(tags=["scrape-stats"])
class ScrapeOverview(BaseModel):
    """Top-level scraping totals for the stats dashboard."""

    total_jobs: int  # total rows in scrape_jobs
    success_jobs: int  # jobs with status = 'done'
    failed_jobs: int  # jobs with status = 'failed'
    total_models_scraped: int  # COUNT(DISTINCT url) over scrape_results — presumably one URL per model; verify
    last_scraped_at: Optional[str] = None  # ISO-formatted MAX(updated_at) over scrape_jobs, or None when there are no jobs
class DailyCount(BaseModel):
    """Number of scrape jobs created on one calendar day."""

    # NOTE(review): this field name shadows the module-level `datetime.date` import
    # inside the class body; harmless for pydantic, but worth knowing.
    date: str  # day as str(DATE(created_at)), i.e. "YYYY-MM-DD"
    count: int  # jobs created on that day
class ModelRankItem(BaseModel):
    """One entry in the per-model scrape-count ranking."""

    model_name: str  # grouping key from scrape_results.model_name
    count: int  # number of scrape_results rows for this model
class RecentJob(BaseModel):
    """Summary of one recently created scrape job."""

    job_id: str  # scrape_jobs.id, stringified
    status: str  # raw scrape_jobs.status value (e.g. 'done' / 'failed' per the overview query)
    model_count: int  # number of scrape_results rows attached to the job (0 when none)
    created_at: str  # ISO-formatted scrape_jobs.created_at
class ScrapeStatsOut(BaseModel):
    """Response body for GET /scrape-stats: all dashboard sections in one payload."""

    overview: ScrapeOverview  # job totals and last-scrape timestamp
    daily_counts: List[DailyCount]  # one entry per day, last 30 days, ascending
    model_ranks: List[ModelRankItem]  # top 15 models by result count, descending
    recent_jobs: List[RecentJob]  # 10 most recently created jobs
  30. @router.get("/scrape-stats", response_model=ScrapeStatsOut)
  31. async def get_scrape_stats() -> ScrapeStatsOut:
  32. pool = get_pool()
  33. # 总览
  34. overview_row = await pool.fetchrow("""
  35. SELECT
  36. COUNT(*) AS total_jobs,
  37. COUNT(*) FILTER (WHERE status = 'done') AS success_jobs,
  38. COUNT(*) FILTER (WHERE status = 'failed') AS failed_jobs,
  39. MAX(updated_at) AS last_scraped_at
  40. FROM scrape_jobs
  41. """)
  42. total_models = await pool.fetchval(
  43. "SELECT COUNT(DISTINCT url) FROM scrape_results"
  44. ) or 0
  45. overview = ScrapeOverview(
  46. total_jobs=overview_row["total_jobs"] or 0,
  47. success_jobs=overview_row["success_jobs"] or 0,
  48. failed_jobs=overview_row["failed_jobs"] or 0,
  49. total_models_scraped=total_models,
  50. last_scraped_at=overview_row["last_scraped_at"].isoformat() if overview_row["last_scraped_at"] else None,
  51. )
  52. # 近30天每日爬取次数
  53. daily_rows = await pool.fetch("""
  54. SELECT DATE(created_at) AS day, COUNT(*) AS cnt
  55. FROM scrape_jobs
  56. WHERE created_at >= NOW() - INTERVAL '30 days'
  57. GROUP BY day
  58. ORDER BY day
  59. """)
  60. daily_counts = [DailyCount(date=str(r["day"]), count=r["cnt"]) for r in daily_rows]
  61. # 模型爬取排行 Top 15
  62. rank_rows = await pool.fetch("""
  63. SELECT model_name, COUNT(*) AS cnt
  64. FROM scrape_results
  65. GROUP BY model_name
  66. ORDER BY cnt DESC
  67. LIMIT 15
  68. """)
  69. model_ranks = [ModelRankItem(model_name=r["model_name"], count=r["cnt"]) for r in rank_rows]
  70. # 最近10条任务
  71. recent_rows = await pool.fetch("""
  72. SELECT j.id, j.status, j.created_at,
  73. COUNT(r.id) AS model_count
  74. FROM scrape_jobs j
  75. LEFT JOIN scrape_results r ON r.job_id = j.id
  76. GROUP BY j.id, j.status, j.created_at
  77. ORDER BY j.created_at DESC
  78. LIMIT 10
  79. """)
  80. recent_jobs = [
  81. RecentJob(
  82. job_id=str(r["id"]),
  83. status=r["status"],
  84. model_count=r["model_count"],
  85. created_at=r["created_at"].isoformat(),
  86. )
  87. for r in recent_rows
  88. ]
  89. return ScrapeStatsOut(
  90. overview=overview,
  91. daily_counts=daily_counts,
  92. model_ranks=model_ranks,
  93. recent_jobs=recent_jobs,
  94. )