Parcourir la source

添加部署信息

kinglee il y a 2 semaines
Parent
commit
20f4ddc6bf

+ 30 - 7
docker-compose/docker-compose.external-observability.yaml

@@ -1,11 +1,33 @@
 services:
 services:
+  postgres:
+    image: postgres:16
+    container_name: gpustack-db
+    restart: unless-stopped
+    environment:
+      POSTGRES_USER: gpustack
+      POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-gpustack}
+      POSTGRES_DB: gpustack
+    volumes:
+      - postgres-data:/var/lib/postgresql/data
+    healthcheck:
+      test: ["CMD-SHELL", "pg_isready -U gpustack"]
+      interval: 5s
+      timeout: 5s
+      retries: 5
+
   gpustack-server:
   gpustack-server:
-    image: ${IMAGE_REGISTRY:-docker.io}/${GPUSTACK_IMAGE_NAMESPACE:-gpustack}/gpustack:${GPUSTACK_TAG:-latest}
+    build:
+      context: ..
+      dockerfile: pack/Dockerfile
     container_name: gpustack-server
     container_name: gpustack-server
     restart: unless-stopped
     restart: unless-stopped
+    depends_on:
+      postgres:
+        condition: service_healthy
     environment:
     environment:
-      - GPUSTACK_DISABLE_BUILTIN_OBSERVABILITY=true
-      - GPUSTACK_GRAFANA_URL=${GPUSTACK_GRAFANA_URL:?set to browser-reachable Grafana URL}
+      GPUSTACK_DATABASE_URL: postgresql://gpustack:${POSTGRES_PASSWORD:-gpustack}@postgres:5432/gpustack
+      GPUSTACK_DISABLE_BUILTIN_OBSERVABILITY: "true"
+      GPUSTACK_GRAFANA_URL: ${GPUSTACK_GRAFANA_URL:-http://localhost:3000}
     volumes:
     volumes:
       - gpustack-data:/var/lib/gpustack
       - gpustack-data:/var/lib/gpustack
     ports:
     ports:
@@ -32,14 +54,15 @@ services:
     ports:
     ports:
       - "3000:3000"
       - "3000:3000"
     environment:
     environment:
-      - GF_SERVER_HTTP_PORT=3000
-      - GF_SECURITY_ADMIN_USER=admin
-      - GF_SECURITY_ADMIN_PASSWORD=grafana
-      - GF_FEATURE_TOGGLES_ENABLE=flameGraph traceqlSearch traceQLStreaming correlations metricsSummary traceqlEditor traceToMetrics traceToProfiles
+      GF_SERVER_HTTP_PORT: "3000"
+      GF_SECURITY_ADMIN_USER: admin
+      GF_SECURITY_ADMIN_PASSWORD: ${GRAFANA_PASSWORD:-grafana}
+      GF_FEATURE_TOGGLES_ENABLE: flameGraph traceqlSearch traceQLStreaming correlations metricsSummary traceqlEditor traceToMetrics traceToProfiles
     volumes:
     volumes:
       - ./grafana/grafana_provisioning:/etc/grafana/provisioning:ro
       - ./grafana/grafana_provisioning:/etc/grafana/provisioning:ro
       - ./grafana/grafana_dashboards:/etc/dashboards:ro
       - ./grafana/grafana_dashboards:/etc/dashboards:ro
 
 
 volumes:
 volumes:
+  postgres-data: {}
   prom_data: {}
   prom_data: {}
   gpustack-data: {}
   gpustack-data: {}

+ 25 - 1
docker-compose/docker-compose.server.yaml

@@ -1,12 +1,36 @@
 services:
 services:
+  postgres:
+    image: postgres:16
+    container_name: gpustack-db
+    restart: unless-stopped
+    environment:
+      POSTGRES_USER: gpustack
+      POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-gpustack}
+      POSTGRES_DB: gpustack
+    volumes:
+      - postgres-data:/var/lib/postgresql/data
+    healthcheck:
+      test: ["CMD-SHELL", "pg_isready -U gpustack"]
+      interval: 5s
+      timeout: 5s
+      retries: 5
+
   gpustack-server:
   gpustack-server:
-    image: ${IMAGE_REGISTRY:-docker.io}/${IMAGE_NAMESPACE:-gpustack}/gpustack:${GPUSTACK_TAG:-latest}
+    build:
+      context: ..
+      dockerfile: pack/Dockerfile
     container_name: gpustack-server
     container_name: gpustack-server
     restart: unless-stopped
     restart: unless-stopped
+    depends_on:
+      postgres:
+        condition: service_healthy
+    environment:
+      GPUSTACK_DATABASE_URL: postgresql://gpustack:${POSTGRES_PASSWORD:-gpustack}@postgres:5432/gpustack
     volumes:
     volumes:
       - gpustack-data:/var/lib/gpustack
       - gpustack-data:/var/lib/gpustack
     ports:
     ports:
       - "80:80"
       - "80:80"
 
 
 volumes:
 volumes:
+  postgres-data: {}
   gpustack-data: {}
   gpustack-data: {}

+ 210 - 0
docs/deployment-from-source-docker.md

@@ -0,0 +1,210 @@
+# 基于源码的 Docker 部署指南
+
+Higress 已内置于 Docker 镜像中(通过 s6-overlay 管理),无需单独部署。
+
+---
+
+## 一、构建镜像
+
+### 环境要求
+
+- Linux(x86_64 或 arm64)
+- Docker 24.0+,启用 BuildKit
+- Git
+
+### 1. 克隆代码
+
+```bash
+git clone <your-repo-url> /opt/gpustack-src
+cd /opt/gpustack-src
+```
+
+### 2. 初始化 buildx(首次执行)
+
+```bash
+docker run --rm --privileged tonistiigi/binfmt:qemu-v9.2.2-52 --install all
+docker buildx create \
+    --name gpustack \
+    --driver docker-container \
+    --driver-opt "network=host,default-load=true" \
+    --bootstrap
+```
+
+### 3. 构建镜像
+
+```bash
+# 使用项目脚本构建(推荐)
+PACKAGE_TAG=my-build PACKAGE_PUSH=false bash hack/package.sh
+```
+
+构建完成后镜像名为 `gpustack/gpustack:my-build`。
+
+也可以直接用 docker buildx:
+
+```bash
+docker buildx build \
+    --builder gpustack \
+    --platform linux/amd64 \
+    --tag gpustack/gpustack:my-build \
+    --file pack/Dockerfile \
+    --ulimit nofile=65536:65536 \
+    --shm-size 16G \
+    --load \
+    .
+```
+
+> 构建时间较长(30~60 分钟),会下载 Higress、Prometheus、Grafana 等组件。
+
+---
+
+## 二、部署 Server
+
+Server 负责 API、调度、数据库、Gateway,是集群的控制节点。
+
+### 1. 进入 docker-compose 目录
+
+```bash
+cd /opt/gpustack-src/docker-compose
+```
+
+### 2. 创建 `.env` 文件
+
+```bash
+cat > .env <<EOF
+POSTGRES_PASSWORD=your_strong_password
+EOF
+```
+
+### 3. 启动 Server
+
+```bash
+docker compose -f docker-compose.server.yaml up -d --build
+```
+
+### 4. 查看初始管理员密码
+
+```bash
+docker exec gpustack-server cat /var/lib/gpustack/initial_admin_password
+```
+
+### 5. 获取 Worker 注册 Token
+
+Worker 节点加入集群时需要此 Token:
+
+```bash
+docker exec gpustack-server cat /var/lib/gpustack/token
+```
+
+### 6. 访问
+
+浏览器打开 `http://<Server-IP>`,使用 `admin` 和初始密码登录。
+
+---
+
+## 三、部署 Worker
+
+Worker 负责运行模型推理实例,可部署在多台 GPU 机器上。
+
+> 前提:Server 已启动并可访问。
+
+### 1. 在 Worker 机器上克隆代码并构建镜像
+
+```bash
+git clone <your-repo-url> /opt/gpustack-src
+cd /opt/gpustack-src
+
+PACKAGE_TAG=my-build PACKAGE_PUSH=false bash hack/package.sh
+```
+
+### 2. 启动 Worker 容器
+
+```bash
+docker run -d \
+    --name gpustack-worker \
+    --restart unless-stopped \
+    --ulimit nofile=65535:65535 \
+    -v gpustack-worker-data:/var/lib/gpustack \
+    gpustack/gpustack:my-build \
+    --server-url http://<Server-IP> \
+    --token <上一步获取的Token>
+```
+
+### 3. 验证 Worker 注册
+
+在 Server 的 Web UI 中查看 Workers 页面,确认新 Worker 已上线。
+
+---
+
+## 四、含监控部署(Prometheus + Grafana)
+
+```bash
+cat > .env <<EOF
+POSTGRES_PASSWORD=your_strong_password
+GRAFANA_PASSWORD=your_grafana_password
+GPUSTACK_GRAFANA_URL=http://<Server-IP>:3000
+EOF
+
+docker compose -f docker-compose.external-observability.yaml up -d --build
+```
+
+| 服务 | 地址 | 默认账号 |
+|------|------|----------|
+| GPUStack | `http://<IP>:80` | admin / 见 initial_admin_password |
+| Grafana | `http://<IP>:3000` | admin / 见 .env |
+| Prometheus | `http://<IP>:9090` | - |
+
+---
+
+## 五、常用运维命令
+
+```bash
+# 查看 Server 日志
+docker logs -f gpustack-server
+
+# 查看 Worker 日志
+docker logs -f gpustack-worker
+
+# 重启 Server
+docker compose -f docker-compose.server.yaml restart gpustack-server
+
+# 重启 Worker
+docker restart gpustack-worker
+
+# 重新构建并更新 Server
+PACKAGE_TAG=new-build bash hack/package.sh
+docker compose -f docker-compose.server.yaml up -d --build
+
+# 重新构建并更新 Worker
+PACKAGE_TAG=new-build PACKAGE_PUSH=false bash hack/package.sh
+docker rm -f gpustack-worker
+docker run -d --name gpustack-worker ... # 同上启动命令,镜像标签改为 gpustack/gpustack:new-build
+
+# 停止所有服务
+docker compose -f docker-compose.server.yaml down
+```
+
+---
+
+## 六、注意事项
+
+1. **构建需要访问 GitHub**:Higress、s6-overlay 等组件从 GitHub 下载,网络不通时需配置代理:
+   ```bash
+   export HTTPS_PROXY=http://your-proxy:port
+   ```
+
+2. **磁盘空间**:构建过程需要约 20GB 空间(含构建缓存)。
+
+3. **NVIDIA GPU 支持**:需提前安装 [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html),Worker 启动时添加:
+   ```bash
+   docker run -d \
+       --name gpustack-worker \
+       --restart unless-stopped \
+       --gpus all \
+       --ulimit nofile=65535:65535 \
+       -v gpustack-worker-data:/var/lib/gpustack \
+       gpustack/gpustack:my-build \
+       --server-url http://<Server-IP> \
+       --token <Token>
+   ```
+
+4. **端口占用**:确保 Server 机器的 80 端口未被占用,Worker 机器的 40000-40063 端口(推理服务端口)未被占用。

+ 187 - 0
docs/deployment-linux-docker.md

@@ -0,0 +1,187 @@
+# Linux Docker 部署指南
+
+## 前置要求
+
+- Linux(Ubuntu 22.04+ 或 CentOS 8+)
+- Docker 24.0+
+- Docker Compose v2.20+
+
+安装 Docker:
+```bash
+curl -fsSL https://get.docker.com | sh
+systemctl enable --now docker
+```
+
+---
+
+## 一、基础部署(含内置 PostgreSQL)
+
+适合快速上手,所有组件运行在同一台机器。
+
+### 1. 进入 docker-compose 目录
+
+```bash
+cd /path/to/maas-base/docker-compose
+```
+
+### 2. 启动服务
+
+```bash
+docker compose -f docker-compose.server.yaml up -d
+```
+
+### 3. 查看初始管理员密码
+
+```bash
+docker exec gpustack-server cat /var/lib/gpustack/initial_admin_password
+```
+
+### 4. 访问
+
+浏览器打开 `http://<服务器IP>`,使用 `admin` 和上一步获取的密码登录。
+
+---
+
+## 二、使用外部 PostgreSQL
+
+如果已有 PostgreSQL 实例,通过环境变量指定连接地址。
+
+### 1. 创建 `.env` 文件
+
+```bash
+cat > .env <<EOF
+POSTGRES_PASSWORD=your_strong_password
+EOF
+```
+
+### 2. 修改 `docker-compose.server.yaml` 中的数据库配置
+
+将 `postgres` 服务替换为外部数据库连接:
+
+```yaml
+environment:
+  GPUSTACK_DATABASE_URL: postgresql://gpustack:your_password@your_db_host:5432/gpustack
+```
+
+并删除 `postgres` 服务、`gpustack-server` 中指向 `postgres` 的 `depends_on` 配置,以及 `postgres-data` volume(否则 Compose 会因依赖了未定义的服务而报错)。
+
+确保外部数据库已执行授权:
+```sql
+GRANT CREATE ON SCHEMA public TO gpustack;
+GRANT ALL PRIVILEGES ON DATABASE gpustack TO gpustack;
+```
+
+### 3. 启动
+
+```bash
+docker compose -f docker-compose.server.yaml up -d
+```
+
+---
+
+## 三、含外部监控部署(Prometheus + Grafana)
+
+适合需要独立监控面板的场景。
+
+### 1. 创建 `.env` 文件
+
+```bash
+cat > .env <<EOF
+POSTGRES_PASSWORD=your_strong_password
+GRAFANA_PASSWORD=your_grafana_password
+GPUSTACK_GRAFANA_URL=http://<服务器IP>:3000
+EOF
+```
+
+> `GPUSTACK_GRAFANA_URL` 必须是浏览器可访问的地址(不能是容器内部地址)。
+
+### 2. 启动
+
+```bash
+docker compose -f docker-compose.external-observability.yaml up -d
+```
+
+### 3. 访问
+
+| 服务 | 地址 | 默认账号 |
+|------|------|----------|
+| GPUStack | `http://<IP>:80` | admin / 见 initial_admin_password |
+| Grafana | `http://<IP>:3000` | admin / 见 .env |
+| Prometheus | `http://<IP>:9090` | - |
+
+---
+
+## 四、常用运维命令
+
+```bash
+# 查看服务状态
+docker compose -f docker-compose.server.yaml ps
+
+# 查看日志
+docker logs -f gpustack-server
+
+# 停止服务
+docker compose -f docker-compose.server.yaml down
+
+# 停止并删除数据(危险)
+docker compose -f docker-compose.server.yaml down -v
+
+# 更新服务(gpustack-server 通过 build 从源码构建,pull 只会更新 postgres 等带 image 的服务,
+# 因此更新代码后需加 --build 重新构建)
+docker compose -f docker-compose.server.yaml pull
+docker compose -f docker-compose.server.yaml up -d --build
+```
+
+---
+
+## 五、GPU 支持
+
+### NVIDIA GPU
+
+需要安装 [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html):
+
+```bash
+curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
+curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \
+  sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
+  tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
+apt-get update && apt-get install -y nvidia-container-toolkit
+nvidia-ctk runtime configure --runtime=docker
+systemctl restart docker
+```
+
+在 `docker-compose.server.yaml` 的 `gpustack-server` 服务中添加:
+
+```yaml
+deploy:
+  resources:
+    reservations:
+      devices:
+        - driver: nvidia
+          count: all
+          capabilities: [gpu]
+```
+
+### AMD GPU
+
+在 `gpustack-server` 服务中添加:
+
+```yaml
+devices:
+  - /dev/kfd:/dev/kfd
+  - /dev/dri:/dev/dri
+group_add:
+  - video
+```
+
+---
+
+## 六、注意事项
+
+1. **端口冲突**:确保 80、5432、9090、3000 端口未被占用。
+2. **防火墙**:开放对应端口:
+   ```bash
+   ufw allow 80/tcp
+   ufw allow 3000/tcp   # Grafana(如需外部访问)
+   ```
+3. **数据持久化**:数据存储在 Docker volume 中,删除容器不会丢失数据,但 `down -v` 会清除所有数据。
+4. **生产环境**:建议修改 `.env` 中的所有默认密码。

+ 212 - 0
docs/deployment-linux.md

@@ -0,0 +1,212 @@
+# Linux 部署指南
+
+## 环境要求
+
+- OS:Ubuntu 22.04 / 24.04(推荐)或 CentOS 8+
+- Python:3.11
+- PostgreSQL:16+(外部实例或自建)
+- uv:Python 包管理器
+
+---
+
+## 一、安装依赖
+
+```bash
+# 系统依赖
+apt-get update && apt-get install -y \
+    python3.11 python3.11-venv python3.11-dev \
+    build-essential libssl-dev libffi-dev \
+    libpq-dev git curl
+
+# 安装 uv
+curl -LsSf https://astral.sh/uv/install.sh | sh
+source $HOME/.local/bin/env
+```
+
+---
+
+## 二、准备代码
+
+```bash
+git clone <your-repo-url> /opt/gpustack
+cd /opt/gpustack
+```
+
+---
+
+## 三、创建虚拟环境并安装依赖
+
+```bash
+cd /opt/gpustack
+uv venv .venv --python 3.11
+source .venv/bin/activate
+uv pip install -e .
+```
+
+---
+
+## 四、准备数据库
+
+使用已有 PostgreSQL 实例,确保数据库和用户已创建并授权:
+
+```sql
+CREATE USER gpustack WITH PASSWORD 'your_password';
+CREATE DATABASE gpustack OWNER gpustack;
+-- 对 schema 的授权只作用于当前连接的数据库,需先切换到 gpustack 库再执行
+\c gpustack
+GRANT ALL ON SCHEMA public TO gpustack;
+GRANT ALL PRIVILEGES ON DATABASE gpustack TO gpustack;
+```
+
+---
+
+## 五、准备前端 UI
+
+将前端构建产物放到 `/opt/gpustack/gpustack/ui/`:
+
+```bash
+# 在前端项目目录执行构建
+cd /path/to/maas-base-ui
+npm install && npm run build
+
+# 将 dist 目录复制为 ui 目录
+cp -r dist /opt/gpustack/gpustack/ui
+```
+
+---
+
+## 六、启动服务
+
+### 开发/测试模式(禁用 Gateway,直接暴露 API)
+
+```bash
+cd /opt/gpustack
+source .venv/bin/activate
+
+gpustack start \
+  --database-url "postgresql://gpustack:your_password@db_host:5432/gpustack" \
+  --gateway-mode disabled \
+  --api-port 80 \
+  --debug
+```
+
+### 生产模式(含 Higress Gateway,推荐 Docker)
+
+见下方 Docker 部署。
+
+---
+
+## 七、Docker 部署(生产推荐)
+
+项目提供了完整的 Docker 镜像,内置 PostgreSQL、Higress、Prometheus、Grafana。
+
+### 使用外部 PostgreSQL
+
+```bash
+docker run -d \
+  --name gpustack \
+  --restart unless-stopped \
+  -p 80:80 \
+  -v gpustack-data:/var/lib/gpustack \
+  -e GPUSTACK_DATABASE_URL="postgresql://gpustack:your_password@db_host:5432/gpustack" \
+  gpustack/gpustack:latest
+```
+
+### 使用内置 PostgreSQL
+
+```bash
+docker run -d \
+  --name gpustack \
+  --restart unless-stopped \
+  -p 80:80 \
+  -v gpustack-data:/var/lib/gpustack \
+  gpustack/gpustack:latest
+```
+
+### 查看初始管理员密码
+
+```bash
+docker exec gpustack cat /var/lib/gpustack/initial_admin_password
+```
+
+---
+
+## 八、Systemd 服务(裸机部署)
+
+创建 `/etc/systemd/system/gpustack.service`:
+
+```ini
+[Unit]
+Description=GPUStack Server
+After=network.target postgresql.service
+
+[Service]
+Type=simple
+User=root
+WorkingDirectory=/opt/gpustack
+Environment="PATH=/opt/gpustack/.venv/bin:/usr/local/bin:/usr/bin:/bin"
+ExecStart=/opt/gpustack/.venv/bin/gpustack start \
+    --database-url postgresql://gpustack:your_password@db_host:5432/gpustack \
+    --gateway-mode disabled \
+    --api-port 80
+Restart=on-failure
+RestartSec=5
+
+[Install]
+WantedBy=multi-user.target
+```
+
+```bash
+systemctl daemon-reload
+systemctl enable gpustack
+systemctl start gpustack
+systemctl status gpustack
+```
+
+---
+
+## 九、常用参数说明
+
+| 参数 | 说明 | 默认值 |
+|------|------|--------|
+| `--database-url` | PostgreSQL 连接 URL | 内置 SQLite |
+| `--gateway-mode` | Gateway 模式:`embedded`/`disabled` | `auto` |
+| `--api-port` | API 服务端口 | `30080` |
+| `--port` | Gateway 对外端口(embedded 模式) | `80` |
+| `--debug` | 开启调试日志 | `false` |
+| `--data-dir` | 数据目录 | `~/.local/share/gpustack` |
+| `--bootstrap-password` | 初始管理员密码 | 随机生成 |
+
+---
+
+## 十、验证部署
+
+```bash
+# 检查 API 是否正常
+curl http://localhost:80/v2/users/me
+
+# 查看 API 文档:在浏览器中访问 http://<服务器IP>:80/docs
+# (Linux 桌面环境下可执行 xdg-open;`open` 是 macOS 命令)
+xdg-open http://localhost:80/docs
+```
+
+---
+
+## 注意事项
+
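+1. **80 端口权限**:Linux 下监听 1024 以下端口需要 root 权限,或使用 `setcap`。虚拟环境中的 `python3.11` 通常是符号链接,而 `setcap` 只能作用于真实文件,因此需先解析出实际的解释器路径(注意:该能力会对所有使用此解释器的程序生效):
+   ```bash
+   setcap 'cap_net_bind_service=+ep' "$(readlink -f /opt/gpustack/.venv/bin/python3.11)"
+   ```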
+
+2. **Gateway 模式**:`embedded` 模式需要 Higress 组件(仅 Docker 镜像内置),裸机部署建议使用 `--gateway-mode disabled`,通过 Nginx 反向代理到 `api-port`。
+
+3. **Nginx 反向代理示例**:
+   ```nginx
+   server {
+       listen 80;
+       location / {
+           proxy_pass http://127.0.0.1:30080;
+           proxy_set_header Host $host;
+           proxy_set_header X-Real-IP $remote_addr;
+           proxy_read_timeout 300s;
+       }
+   }
+   ```

+ 17 - 0
gpustack/cmd/start.py

@@ -676,12 +676,29 @@ def run(args: argparse.Namespace):
         if cfg.server_url:
         if cfg.server_url:
             run_worker(cfg)
             run_worker(cfg)
         else:
         else:
+            check_database_available(cfg)
             run_server(cfg)
             run_server(cfg)
     except Exception as e:
     except Exception as e:
         logger.exception(e)
         logger.exception(e)
         sys.exit(1)
         sys.exit(1)
 
 
 
 
+def check_database_available(cfg):
+    """Abort startup early when the configured database cannot be reached.
+
+    Reads the URL from ``cfg.get_database_url()`` and probes it with
+    ``gpustack.utils.db.test_db_connection``. When the probe fails, an error
+    containing a Docker quick-start hint is logged and the process exits
+    with status 1.
+
+    Args:
+        cfg: Configuration object exposing ``get_database_url()``.
+
+    Raises:
+        SystemExit: With code 1 when the connection test fails.
+    """
+    # Function-scope imports keep this block self-contained.
+    import re
+
+    from gpustack.utils.db import test_db_connection
+
+    db_url = cfg.get_database_url()
+    if test_db_connection(db_url):
+        return
+
+    # Never write credentials to the log: mask the password portion of
+    # "scheme://user:password@host/..." before interpolating the URL.
+    # URLs without a password (or without credentials at all) are unchanged.
+    safe_url = re.sub(
+        r'^([A-Za-z0-9+.\-]+://[^:/@]+):[^/]*@', r'\1:***@', db_url
+    )
+    logger.error(
+        f"Cannot connect to database at {safe_url}. "
+        "GPUStack requires a PostgreSQL database. "
+        "You can start one quickly with Docker:\n"
+        "  docker run -d --name gpustack-db "
+        "-e POSTGRES_USER=root -e POSTGRES_HOST_AUTH_METHOD=trust "
+        "-e POSTGRES_DB=gpustack -p 5432:5432 postgres:16"
+    )
+    sys.exit(1)
+
+
 def run_server(cfg: Config):
 def run_server(cfg: Config):
     server = Server(
     server = Server(
         config=cfg,
         config=cfg,

+ 1 - 1
gpustack/config/config.py

@@ -626,7 +626,7 @@ class Config(WorkerConfig, BaseSettings):
 
 
         if not self.database_url.startswith(
         if not self.database_url.startswith(
             "postgresql://"
             "postgresql://"
-        ) and not self.database_url.startswith("mysql://"):
+        ) and not self.database_url.startswith("mysql://") and not self.database_url.startswith("sqlite"):
             raise Exception(
             raise Exception(
                 "Unsupported database scheme. Supported databases are postgresql, and mysql."
                 "Unsupported database scheme. Supported databases are postgresql, and mysql."
             )
             )

+ 13 - 3
gpustack/gateway/__init__.py

@@ -78,9 +78,15 @@ def init_async_k8s_config(cfg: Config):
         )
         )
         cfg_loader.load_and_set(configuration)
         cfg_loader.load_and_set(configuration)
     else:
     else:
-        cfg_loader = KubeConfigLoader(
-            config_dict=KubeConfigMerger(cfg.gateway_kubeconfig).config
-        )
+        kubeconfig_path = cfg.gateway_kubeconfig
+        if not kubeconfig_path or not os.path.isfile(kubeconfig_path):
+            logger.debug(f"Kubeconfig not found at {kubeconfig_path}, skipping k8s config initialization")
+            return
+        config_dict = KubeConfigMerger(cfg.gateway_kubeconfig).config
+        if not config_dict or not config_dict.get("current-context"):
+            logger.debug(f"Kubeconfig at {kubeconfig_path} is empty or missing current-context, skipping k8s config initialization")
+            return
+        cfg_loader = KubeConfigLoader(config_dict=config_dict)
         if not cfg_loader._load_user_token():
         if not cfg_loader._load_user_token():
             cfg_loader._load_user_pass_token()
             cfg_loader._load_user_pass_token()
         cfg_loader._load_cluster_info()
         cfg_loader._load_cluster_info()
@@ -778,6 +784,10 @@ def initialize_gateway(cfg: Config, timeout: int = 60, interval: int = 5):
     if cfg.gateway_mode == GatewayModeEnum.disabled:
     if cfg.gateway_mode == GatewayModeEnum.disabled:
         return
         return
     init_async_k8s_config(cfg=cfg)
     init_async_k8s_config(cfg=cfg)
+    # If k8s config couldn't be initialized (e.g., no valid kubeconfig), skip gateway setup
+    if async_gateway_config is None:
+        logger.warning("Gateway k8s config could not be initialized, skipping gateway setup")
+        return
     wait_for_apiserver_ready(cfg=cfg, timeout=timeout, interval=interval)
     wait_for_apiserver_ready(cfg=cfg, timeout=timeout, interval=interval)
     if cfg.gateway_mode in [
     if cfg.gateway_mode in [
         GatewayModeEnum.embedded,
         GatewayModeEnum.embedded,

+ 11 - 3
gpustack/gateway/utils.py

@@ -942,6 +942,8 @@ async def cleanup_model_mapper(
     config: k8s_client.Configuration,
     config: k8s_client.Configuration,
     extra_labels: Optional[Dict[str, str]] = None,
     extra_labels: Optional[Dict[str, str]] = None,
 ):
 ):
+    if config is None:
+        return
     api = ExtensionsHigressIoV1Api(k8s_client.ApiClient(config))
     api = ExtensionsHigressIoV1Api(k8s_client.ApiClient(config))
     labels = copy.deepcopy(managed_labels)
     labels = copy.deepcopy(managed_labels)
     if extra_labels:
     if extra_labels:
@@ -978,6 +980,8 @@ async def cleanup_ingresses(
     cleanup_prefix: str,
     cleanup_prefix: str,
     reason: str = "orphaned",
     reason: str = "orphaned",
 ):
 ):
+    if config is None:
+        return
     networking_api = k8s_client.NetworkingV1Api(k8s_client.ApiClient(config))
     networking_api = k8s_client.NetworkingV1Api(k8s_client.ApiClient(config))
     try:
     try:
         # Use label selector to filter only managed ingresses
         # Use label selector to filter only managed ingresses
@@ -1126,9 +1130,7 @@ async def cleanup_fallback_filters(
 ):
 ):
     if networking_istio_api is None:
     if networking_istio_api is None:
         if k8s_config is None:
         if k8s_config is None:
-            raise ValueError(
-                "Either networking_istio_api or k8s_config must be provided."
-            )
+            return
         networking_istio_api = NetworkingIstioIoV1Alpha3Api(
         networking_istio_api = NetworkingIstioIoV1Alpha3Api(
             k8s_client.ApiClient(k8s_config)
             k8s_client.ApiClient(k8s_config)
         )
         )
@@ -1274,6 +1276,8 @@ async def cleanup_ai_proxy_config(
     k8s_config: k8s_client.Configuration,
     k8s_config: k8s_client.Configuration,
     namespace: str,
     namespace: str,
 ):
 ):
+    if k8s_config is None:
+        return
     prefixes_to_keep = {model_route_cleanup_prefix(route.id) for route in routes}
     prefixes_to_keep = {model_route_cleanup_prefix(route.id) for route in routes}
     prefixes_to_keep.update(
     prefixes_to_keep.update(
         {provider_registry_name(provider.id) for provider in providers}
         {provider_registry_name(provider.id) for provider in providers}
@@ -1490,6 +1494,8 @@ async def cleanup_generic_route_transformer(
     namespace: str,
     namespace: str,
 ):
 ):
     """Prune generic-route transformer rules to those for existing generic_proxy routes."""
     """Prune generic-route transformer rules to those for existing generic_proxy routes."""
+    if k8s_config is None:
+        return
     expected_patterns = {
     expected_patterns = {
         build_generic_route_path_pattern(route.id)
         build_generic_route_path_pattern(route.id)
         for route in routes
         for route in routes
@@ -1514,6 +1520,8 @@ async def cleanup_mcpbridge_registry(
     namespace: str,
     namespace: str,
     k8s_config: k8s_client.Configuration,
     k8s_config: k8s_client.Configuration,
 ):
 ):
+    if k8s_config is None:
+        return
     worker_by_id = {worker.id: worker for worker in workers}
     worker_by_id = {worker.id: worker for worker in workers}
     networking_higress_api = NetworkingHigressIoV1Api(k8s_client.ApiClient(k8s_config))
     networking_higress_api = NetworkingHigressIoV1Api(k8s_client.ApiClient(k8s_config))
     # cleanup providers
     # cleanup providers

+ 1 - 1
gpustack/routes/ui.py

@@ -7,7 +7,7 @@ from fastapi.staticfiles import StaticFiles
 def register(app: FastAPI):
 def register(app: FastAPI):
     ui_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), "ui")
     ui_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), "ui")
     if not os.path.isdir(ui_dir):
     if not os.path.isdir(ui_dir):
-        raise RuntimeError(f"directory '{ui_dir}' does not exist")
+        return
 
 
     for name in ["css", "js", "static"]:
     for name in ["css", "js", "static"]:
         app.mount(
         app.mount(

+ 3 - 0
gpustack/server/init_db.py

@@ -90,6 +90,9 @@ async def init_db_engine(db_url: str):
 
 
     elif db_url.startswith("mysql://"):
     elif db_url.startswith("mysql://"):
         db_url = re.sub(r'^mysql://', 'mysql+asyncmy://', db_url)
         db_url = re.sub(r'^mysql://', 'mysql+asyncmy://', db_url)
+    elif db_url.startswith("sqlite"):
+        # Convert sqlite:// to sqlite+aiosqlite://
+        db_url = re.sub(r'^sqlite(\+aiosqlite)?://', 'sqlite+aiosqlite://', db_url)
     else:
     else:
         raise Exception(f"Unsupported database URL: {db_url}")
         raise Exception(f"Unsupported database URL: {db_url}")
 
 

+ 17 - 0
gpustack/utils/db.py

@@ -2,11 +2,28 @@
 
 
 import re
 import re
 
 
+from sqlalchemy import create_engine, text
 from sqlalchemy.dialects.postgresql import base as pg_base
 from sqlalchemy.dialects.postgresql import base as pg_base
 
 
 _pg_version_patched = False
 _pg_version_patched = False
 
 
 
 
+def test_db_connection(db_url: str, timeout: int = 5) -> bool:
+    """Report whether a database connection can be established.
+
+    Args:
+        db_url: Database URL. URLs starting with ``sqlite`` are accepted
+            without probing.
+        timeout: Value passed to the driver as the ``connect_timeout``
+            connect argument.
+
+    Returns:
+        True when ``SELECT 1`` succeeds (or the URL is SQLite), False when
+        creating the engine, connecting, or executing raises any exception.
+    """
+    if db_url.startswith("sqlite"):
+        # SQLite doesn't need a pre-connection test, the file will be created
+        return True
+
+    engine = None
+    try:
+        # The URL is used as given (no async-to-sync driver rewriting).
+        # NOTE(review): a missing synchronous driver also raises here and is
+        # reported as "unreachable" — confirm the sync driver is installed.
+        engine = create_engine(db_url, connect_args={"connect_timeout": timeout})
+        with engine.connect() as conn:
+            conn.execute(text("SELECT 1"))
+        return True
+    except Exception:
+        return False
+    finally:
+        # Release pooled connections on the failure path too, not only
+        # after a successful probe.
+        if engine is not None:
+            engine.dispose()
+
+
 def patch_pg_version_info() -> None:
 def patch_pg_version_info() -> None:
     """Teach SQLAlchemy's PGDialect to parse openGauss version strings.
     """Teach SQLAlchemy's PGDialect to parse openGauss version strings.
 
 

+ 70 - 8
gpustack/utils/envs.py

@@ -1,4 +1,5 @@
 import os
 import os
+import platform
 import subprocess
 import subprocess
 from pathlib import Path
 from pathlib import Path
 from typing import Dict, List, Optional
 from typing import Dict, List, Optional
@@ -6,17 +7,32 @@ from typing import Dict, List, Optional
 
 
 def extract_unix_vars_of_source(script_paths: List[Path]) -> Dict[str, str]:
 def extract_unix_vars_of_source(script_paths: List[Path]) -> Dict[str, str]:
     """
     """
-    Extracts the environment variables from a source-able script on *unix.
-    Needs to be sourced in a bash shell.
+    Extracts the environment variables from a source-able script on Unix.
+    On Windows, uses PowerShell to source the script and capture env changes.
     """
     """
-    # Assume the script exists and is executable
     for script_path in script_paths:
     for script_path in script_paths:
         if not script_path.is_file():
         if not script_path.is_file():
             raise Exception(
             raise Exception(
                 f"The file '{script_path}' does not exist or is not a file."
                 f"The file '{script_path}' does not exist or is not a file."
             )
             )
 
 
-    # Parse the result output of executing "env"
+    def parse_env(env_str):
+        env = {}
+        for line in env_str.splitlines():
+            if '=' in line:
+                key, value = line.split('=', 1)
+                env[key] = value
+        return env
+
+    system = platform.system().lower()
+
+    if system == "windows":
+        return _extract_env_via_powershell(script_paths)
+    else:
+        return _extract_env_via_bash(script_paths)
+
+
+def _extract_env_via_bash(script_paths: List[Path]) -> Dict[str, str]:
     def parse_env(env_str):
     def parse_env(env_str):
         env = {}
         env = {}
         for line in env_str.splitlines():
         for line in env_str.splitlines():
@@ -26,7 +42,6 @@ def extract_unix_vars_of_source(script_paths: List[Path]) -> Dict[str, str]:
         return env
         return env
 
 
     try:
     try:
-        # Get original environment variables
         original_env_output = subprocess.check_output(
         original_env_output = subprocess.check_output(
             ['bash', '-c', 'env'],
             ['bash', '-c', 'env'],
             stderr=subprocess.PIPE,
             stderr=subprocess.PIPE,
@@ -34,12 +49,10 @@ def extract_unix_vars_of_source(script_paths: List[Path]) -> Dict[str, str]:
         )
         )
         original = parse_env(original_env_output)
         original = parse_env(original_env_output)
 
 
-        # Merge all sourcing script paths in to one command
         source_command = ' && '.join(
         source_command = ' && '.join(
             [f'source {script_path}' for script_path in script_paths]
             [f'source {script_path}' for script_path in script_paths]
         )
         )
 
 
-        # Get the environment variables after sourcing the script
         sourced_env_output = subprocess.check_output(
         sourced_env_output = subprocess.check_output(
             ['bash', '-c', f'{source_command} && env'],
             ['bash', '-c', f'{source_command} && env'],
             stderr=subprocess.PIPE,
             stderr=subprocess.PIPE,
@@ -47,7 +60,6 @@ def extract_unix_vars_of_source(script_paths: List[Path]) -> Dict[str, str]:
         )
         )
         sourced = parse_env(sourced_env_output)
         sourced = parse_env(sourced_env_output)
 
 
-        # Get the difference
         diff = {
         diff = {
             k: v
             k: v
             for k, v in sourced.items()
             for k, v in sourced.items()
@@ -61,6 +73,56 @@ def extract_unix_vars_of_source(script_paths: List[Path]) -> Dict[str, str]:
         )
         )
 
 
 
 
+def _extract_env_via_powershell(script_paths: List[Path]) -> Dict[str, str]:
+    """
+    Dot-sources the given scripts in a single PowerShell process and returns
+    the environment variables that were added or changed by them.
+
+    The PowerShell command snapshots ``Env:`` before and after running the
+    scripts and prints both snapshots separated by marker lines; the
+    difference between the two snapshots is returned.
+
+    Args:
+        script_paths: Scripts to source, in order.
+
+    Returns:
+        Mapping of variable name to value for every variable that is new or
+        whose value differs after sourcing. Empty when the expected markers
+        are missing from the output.
+
+    Raises:
+        Exception: When the PowerShell process exits with a non-zero status.
+    """
+
+    def parse_env(env_str: str) -> Dict[str, str]:
+        # One NAME=VALUE pair per line; lines without '=' are ignored.
+        # Defined locally because no module-level parse_env exists.
+        env = {}
+        for line in env_str.splitlines():
+            if '=' in line:
+                key, value = line.split('=', 1)
+                env[key] = value
+        return env
+
+    try:
+        # A bare quoted path is just a string expression in PowerShell (it is
+        # echoed, not executed), so each script is dot-sourced explicitly.
+        # Single quotes inside a path are doubled, which is how PowerShell
+        # escapes them within a single-quoted string.
+        source_commands = '; '.join(
+            ". '{}'".format(str(p).replace("'", "''")) for p in script_paths
+        )
+        ps_command = (
+            '$before = Get-ChildItem Env: | ForEach-Object { "$($_.Name)=$($_.Value)" }; '
+            f'{source_commands}; '
+            '$after = Get-ChildItem Env: | ForEach-Object { "$($_.Name)=$($_.Value)" }; '
+            'Write-Output "---BEFORE---"; '
+            'Write-Output $before; '
+            'Write-Output "---AFTER---"; '
+            'Write-Output $after'
+        )
+
+        output = subprocess.check_output(
+            ['powershell', '-NoProfile', '-NonInteractive', '-Command', ps_command],
+            stderr=subprocess.PIPE,
+            text=True,
+        )
+
+        # Anything the scripts printed precedes the BEFORE marker and is
+        # discarded by taking the part after it.
+        parts = output.split('---BEFORE---')
+        if len(parts) < 2:
+            return {}
+        after_section = parts[1].split('---AFTER---')
+        before_str = after_section[0]
+        after_str = after_section[1] if len(after_section) > 1 else ''
+
+        before = parse_env(before_str)
+        after = parse_env(after_str)
+
+        # Keep only variables that are new or whose value changed.
+        diff = {
+            k: v
+            for k, v in after.items()
+            if k not in before or before.get(k) != v
+        }
+
+        return diff
+    except subprocess.CalledProcessError as e:
+        raise Exception(
+            f"Failed to extract environment variables from [{script_paths}] via PowerShell: {e.stderr}"
+        )
+
+
 def get_gpustack_env(env_var: str) -> Optional[str]:
 def get_gpustack_env(env_var: str) -> Optional[str]:
     env_name = "GPUSTACK_" + env_var
     env_name = "GPUSTACK_" + env_var
     return os.getenv(env_name)
     return os.getenv(env_name)

+ 14 - 2
gpustack/utils/locks.py

@@ -1,15 +1,22 @@
 import logging
 import logging
 import os
 import os
+import platform
 import time
 import time
 import threading
 import threading
 import json
 import json
 import socket
 import socket
 from typing import Optional
 from typing import Optional
 from filelock import SoftFileLock, Timeout
 from filelock import SoftFileLock, Timeout
-import fcntl
-import errno
 from modelscope.hub.utils.utils import model_id_to_group_owner_name
 from modelscope.hub.utils.utils import model_id_to_group_owner_name
 
 
+_HAS_FCNTL = False
+try:
+    import fcntl
+    import errno
+    _HAS_FCNTL = True
+except ModuleNotFoundError:
+    pass
+
 from gpustack.envs import DISABLE_OS_FILELOCK
 from gpustack.envs import DISABLE_OS_FILELOCK
 from gpustack.schemas import ModelFile
 from gpustack.schemas import ModelFile
 from gpustack.schemas.models import SourceEnum
 from gpustack.schemas.models import SourceEnum
@@ -121,6 +128,11 @@ class HeartbeatSoftFileLock:
             self._release_os_lock()
             self._release_os_lock()
 
 
     def _acquire_os_lock(self):
     def _acquire_os_lock(self):
+        if not _HAS_FCNTL:
+            # Windows doesn't support fcntl — fall back to soft lock
+            self._using_soft_lock = True
+            return
+
         dirpath = os.path.dirname(self._lock_path)
         dirpath = os.path.dirname(self._lock_path)
         if dirpath:
         if dirpath:
             os.makedirs(dirpath, exist_ok=True)
             os.makedirs(dirpath, exist_ok=True)

+ 16 - 1
gpustack/utils/uuid.py

@@ -37,13 +37,28 @@ def get_system_uuid() -> str:
             for line in output.decode().split('\n'):
             for line in output.decode().split('\n'):
                 if 'IOPlatformUUID' in line:
                 if 'IOPlatformUUID' in line:
                     return line.split('=')[-1].strip().strip('"')
                     return line.split('=')[-1].strip().strip('"')
-        elif system == 'win32':
+        elif sys.platform == 'win32':
+            # Try PowerShell first (works on Win11 24H2+ where wmic is removed)
+            try:
+                output = subprocess.check_output(
+                    ['powershell', '-NoProfile', '-NonInteractive',
+                     '-Command', '(Get-CimInstance Win32_ComputerSystemProduct).UUID'],
+                    stderr=subprocess.DEVNULL,
+                    text=True,
+                )
+                result = output.strip()
+                if result:
+                    return result
+            except (subprocess.CalledProcessError, FileNotFoundError):
+                pass
+            # Fallback to wmic for older Windows versions
             output = subprocess.check_output(
             output = subprocess.check_output(
                 ['wmic', 'csproduct', 'get', 'uuid'], stderr=subprocess.DEVNULL
                 ['wmic', 'csproduct', 'get', 'uuid'], stderr=subprocess.DEVNULL
             )
             )
             lines = output.decode().split('\n')
             lines = output.decode().split('\n')
             if len(lines) > 1:
             if len(lines) > 1:
                 return lines[1].strip()
                 return lines[1].strip()
+            raise RuntimeError("Unable to retrieve Windows UUID")
         else:
         else:
             raise RuntimeError(f"Not supported OS or unable to retrieve {system} UUID")
             raise RuntimeError(f"Not supported OS or unable to retrieve {system} UUID")
     except Exception as e:
     except Exception as e:

Fichier diff supprimé car celui-ci est trop grand
+ 429 - 429
uv.lock


Certains fichiers n'ont pas été affichés car il y a eu trop de fichiers modifiés dans ce diff