Ver código fonte

docker运行

lxylxy123321 1 semana atrás
pai
commit
f227a3996e

+ 8 - 0
.dockerignore

@@ -0,0 +1,8 @@
+# Build-context exclusions that apply when the repository root is used as the Docker build context.
+# NOTE(review): docker-compose.yml builds from ./backend and ./frontend, so those builds
+# presumably read the .dockerignore inside each of those directories instead — confirm.
+# Installed dependencies, VCS metadata and editor settings
+node_modules/
+.git/
+.vscode/
+.idea/
+# Markdown documentation
+*.md
+# Docker build/orchestration files themselves
+Dockerfile
+docker-compose*.yml
+.dockerignore

+ 12 - 0
backend/.dockerignore

@@ -0,0 +1,12 @@
+# Files excluded from the backend image build context.
+# Python bytecode caches
+__pycache__/
+*.pyc
+*.pyo
+# Packaging / build artefacts
+*.egg-info/
+dist/
+build/
+*.egg
+# Local virtual environments
+.venv/
+venv/
+# Local environment file (this exact name only; .env.docker is not matched by this pattern)
+.env
+# Runtime data directory and database files
+data/
+*.db

+ 35 - 0
backend/.env.docker

@@ -0,0 +1,35 @@
+# Runtime configuration for the backend when it runs inside Docker
+# (referenced from docker-compose.yml through `env_file`).
+BACKEND_HOST=0.0.0.0
+BACKEND_PORT=8010
+BACKEND_ENV=production
+BACKEND_LOG_LEVEL=INFO
+BACKEND_CORS_ORIGINS=http://localhost
+
+# Database location inside the container.
+# SQLAlchemy-style SQLite URLs need FOUR slashes for an absolute path;
+# with three slashes the path is taken relative to the process working directory,
+# which would put the database outside DATA_DIR below.
+DATABASE_URL=sqlite+aiosqlite:////root/Fine-tuning/backend/data/finetuning.db
+
+# Data directory inside the container
+DATA_DIR=/root/Fine-tuning/backend/data
+
+# Default training hyper-parameters
+DEFAULT_PEFT_METHOD=lora
+DEFAULT_EPOCHS=3
+DEFAULT_BATCH_SIZE=4
+DEFAULT_GRADIENT_ACCUMULATION=4
+DEFAULT_LR=2e-4
+DEFAULT_MAX_SEQ_LENGTH=2048
+DEFAULT_WARMUP_RATIO=0.05
+DEFAULT_SAVE_STRATEGY=epoch
+DEFAULT_EVAL_STRATEGY=epoch
+DEFAULT_EVAL_STEPS=100
+
+# LoRA settings
+LORA_R=16
+LORA_ALPHA=32
+LORA_DROPOUT=0.05
+LORA_TARGET_MODULES=all-linear
+
+# QLoRA settings
+QLORA_BITS=4
+QLORA_TYPE=nf4
+QLORA_DOUBLE_QUANT=true
+
+# Dataset upload limits
+MAX_UPLOAD_SIZE_MB=500
+ALLOWED_DATASET_FORMATS=jsonl,csv,parquet,json

+ 30 - 0
backend/Dockerfile

@@ -0,0 +1,30 @@
+# Base image: the MetaX (沐曦) MACA 3.5.3.502 + PyTorch 2.8 + Python 3.10 image,
+# expected to be already present on the build host.
+FROM cr.metax-tech.com/public-ai-release/maca/vllm-metax:0.19.0-maca.ai3.5.3.502-torch2.8-py310-ubuntu22.04-amd64
+
+WORKDIR /app
+
+# The base image keeps its Python toolchain under /opt/conda; put it first on PATH.
+ENV PATH="/opt/conda/bin:$PATH"
+
+# Bring pip up to date before resolving the requirements.
+RUN /opt/conda/bin/pip install --no-cache-dir --upgrade pip
+
+# Copy only the dependency manifest and install from it before the source is copied.
+# torch is intentionally left to the base image, which already ships it.
+COPY requirements.txt .
+RUN /opt/conda/bin/pip install --no-cache-dir -r requirements.txt
+
+# Application source.
+COPY . .
+
+# MACA toolchain locations; the base image usually sets these already,
+# they are declared here explicitly.
+ENV MACA_PATH=/opt/maca \
+    LD_LIBRARY_PATH=/opt/maca/lib:/opt/maca/mxgpu_llvm/lib:/opt/maca/ompi/lib:${LD_LIBRARY_PATH} \
+    MACA_CLANG_PATH=/opt/maca/mxgpu_llvm/bin
+
+EXPOSE 8010
+
+# Probe the /health endpoint with the image's own Python interpreter.
+HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
+    CMD /opt/conda/bin/python -c "import urllib.request; urllib.request.urlopen('http://localhost:8010/health')" || exit 1
+
+# Serve the ASGI app on all interfaces on the exposed port.
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8010"]

+ 13 - 2
backend/app/services/model_service.py

@@ -155,10 +155,21 @@ async def delete_model(model_id: str) -> dict[str, Any]:
         if not record:
             return {"status": "not_found", "message": f"Model not found: {model_id}"}
 
-        # 删除本地文件目录
+        # 删除本地文件目录(对软链接,删除其指向的真实目录)
         model_dir = Path(record.path) if record.path else settings.models_dir / record.id.replace("/", "_")
         deleted_files = False
-        if model_dir.exists() and model_dir.is_dir():
+        if model_dir.is_symlink():
+            # ModelScope 下载的模型可能是软链接,删除真实目录
+            real_dir = model_dir.resolve()
+            import shutil
+            if real_dir.exists() and real_dir.is_dir():
+                shutil.rmtree(real_dir, ignore_errors=True)
+            # 如果还有父级软链接(如 dphn/ 下的其他链接),一并清理
+            parent_link = model_dir.parent
+            if parent_link.is_symlink():
+                shutil.rmtree(parent_link, ignore_errors=True)
+            deleted_files = True
+        elif model_dir.exists() and model_dir.is_dir():
             import shutil
             shutil.rmtree(model_dir, ignore_errors=True)
             deleted_files = True

+ 6 - 3
backend/requirements.txt

@@ -8,13 +8,16 @@ aiosqlite>=0.20.0
 alembic>=1.13.0
 python-multipart>=0.0.9
 websockets>=12.0
-torch>=2.4.0
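+# GPU-specific dependencies must be installed manually for the accelerator in use:
+#   NVIDIA GPU:          pip install torch --index-url https://download.pytorch.org/whl/cu121
+#   Moore Threads GPU:   pip install torch_musa --index-url https://release.mthreads.com/repo/pypi/simple
+#   MetaX (沐曦) GPU:    use the torch build shipped with the MetaX MACA base image (see backend/Dockerfile)
+# Framework and tooling dependencies follow.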
 transformers>=4.45.0
 peft>=0.13.0
 trl>=0.12.0
 datasets>=3.0.0
-accelerate>=1.0.0
-bitsandbytes>=0.44.0
+# accelerate>=1.0.0          # 随 GPU 环境一起安装
+# bitsandbytes>=0.44.0        # 仅 NVIDIA GPU 需要,沐曦环境可能不兼容
 scipy>=1.14.0
 scikit-learn>=1.5.0
 pillow>=10.4.0

+ 49 - 0
docker-compose.yml

@@ -0,0 +1,49 @@
+version: "3.8"
+
+services:
+  # API / training backend running on the MetaX MACA base image.
+  backend:
+    build:
+      context: ./backend
+      dockerfile: Dockerfile
+    container_name: finetune-backend
+    restart: unless-stopped
+    ports:
+      - "8010:8010"
+    volumes:
+      # Persist data and models on the host
+      - ./data:/root/Fine-tuning/backend/data
+    env_file:
+      - ./backend/.env.docker
+    environment:
+      - BACKEND_HOST=0.0.0.0
+      - BACKEND_PORT=8010
+      - BACKEND_CORS_ORIGINS=http://localhost
+      # MetaX MACA toolchain locations.
+      # LD_LIBRARY_PATH is deliberately NOT set here: backend/Dockerfile prepends the MACA
+      # directories to the base image's own LD_LIBRARY_PATH, and a value given here would
+      # replace the whole variable at runtime and drop the base image's entries.
+      - MACA_PATH=/opt/maca
+      - MACA_CLANG_PATH=/opt/maca/mxgpu_llvm/bin
+    devices:
+      # MetaX GPU device node
+      - /dev/mxcd:/dev/mxcd
+    privileged: true
+    networks:
+      - finetune-net
+
+  # Static frontend served by nginx, which also reverse-proxies to the backend.
+  frontend:
+    build:
+      context: ./frontend
+      dockerfile: Dockerfile
+      args:
+        VITE_API_BASE_URL: /api/v1
+        VITE_WS_BASE_URL: /ws
+    container_name: finetune-frontend
+    restart: unless-stopped
+    ports:
+      - "80:80"
+    depends_on:
+      - backend
+    networks:
+      - finetune-net
+
+networks:
+  finetune-net:
+    driver: bridge

+ 7 - 0
frontend/.dockerignore

@@ -0,0 +1,7 @@
+# Files excluded from the frontend image build context.
+# Installed dependencies and build output
+node_modules/
+dist/
+build/
+# Local environment files
+.env
+.env.*
+# Logs and macOS Finder metadata
+*.log
+.DS_Store

+ 28 - 0
frontend/Dockerfile

@@ -0,0 +1,28 @@
+# Build stage: install dependencies and produce the static bundle.
+FROM node:20-alpine AS builder
+
+WORKDIR /app
+
+# Copy the dependency manifests before the sources so the install layer is only
+# rebuilt when they change. `npm ci` refuses to run without a lockfile, so
+# package-lock.json has to be copied together with package.json (the glob matches both).
+COPY package*.json ./
+RUN npm ci
+
+# Copy the sources and build; the API / WebSocket base URLs are fixed at build time.
+COPY . .
+ARG VITE_API_BASE_URL=/api/v1
+ARG VITE_WS_BASE_URL=/ws
+ENV VITE_API_BASE_URL=$VITE_API_BASE_URL
+ENV VITE_WS_BASE_URL=$VITE_WS_BASE_URL
+RUN npm run build
+
+# Runtime stage: nginx serves the static files.
+FROM nginx:alpine
+
+# Build output from the builder stage
+COPY --from=builder /app/dist /usr/share/nginx/html
+
+# nginx site configuration (reverse-proxies API traffic to the backend)
+COPY nginx.conf /etc/nginx/conf.d/default.conf
+
+EXPOSE 80
+
+CMD ["nginx", "-g", "daemon off;"]

+ 31 - 0
frontend/nginx.conf

@@ -0,0 +1,31 @@
+server {
+    listen 80;
+    server_name _;
+
+    # Accept request bodies up to the backend's upload limit (MAX_UPLOAD_SIZE_MB=500 in
+    # backend/.env.docker). nginx's default of 1m would reject larger uploads with
+    # 413 before they ever reach the backend.
+    client_max_body_size 500m;
+
+    # Frontend static files; unknown paths fall back to index.html for client-side routing.
+    location / {
+        root /usr/share/nginx/html;
+        index index.html;
+        try_files $uri $uri/ /index.html;
+    }
+
+    # Reverse-proxy API requests to the backend service.
+    location /api/ {
+        proxy_pass http://backend:8010;
+        proxy_set_header Host $host;
+        proxy_set_header X-Real-IP $remote_addr;
+        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+        proxy_set_header X-Forwarded-Proto $scheme;
+    }
+
+    # WebSocket proxy.
+    location /ws/ {
+        proxy_pass http://backend:8010;
+        proxy_http_version 1.1;
+        proxy_set_header Upgrade $http_upgrade;
+        proxy_set_header Connection "upgrade";
+        proxy_set_header Host $host;
+        proxy_set_header X-Real-IP $remote_addr;
+        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+        # nginx closes a proxied connection after 60s without traffic by default;
+        # raise the idle timeouts so quiet WebSocket connections are not dropped.
+        proxy_read_timeout 3600s;
+        proxy_send_timeout 3600s;
+    }
+}

+ 230 - 0
result.txt

@@ -0,0 +1,230 @@
+(finetuning) [root@localhost backend]# IMG="cr.metax-tech.com/public-ai-release/maca/vllm-metax:0.19.0-maca.ai3.5.3.502-torch2.8-py310-ubuntu22.04-amd64" && docker run --rm --device /dev/mxcd --privileged $IMG bash -c "ls /opt/conda/bin 2>/dev/null || ls /root/.conda/envs/*/bin/python3 2>/dev/null || find / -name 'torch' -type d 2>/dev/null | head -5"
+2to3
+2to3-3.10
+__pycache__
+activate
+adig
+ahost
+archspec
+bsdcat
+bsdcpio
+bsdtar
+bsdunzip
+bunzip2
+bzcat
+bzcmp
+bzdiff
+bzegrep
+bzfgrep
+bzgrep
+bzip2
+bzip2recover
+bzless
+bzmore
+c_rehash
+captoinfo
+cbor2
+cjpeg
+clear
+codecov
+coloredlogs
+compile_et
+conda
+conda-env
+conda2solv
+coverage
+coverage-3.10
+coverage3
+cph
+cpuinfo
+curl-config
+datasets-cli
+deactivate
+deep
+derb
+distro
+djpeg
+dotenv
+dumpsolv
+echo_supervisord_conf
+email_validator
+f2py
+fastapi
+flask
+genbrk
+gencfu
+gencnval
+gendict
+generate-supervisor-config
+genrb
+get_gprof
+get_objgraph
+gguf-convert-endian
+gguf-dump
+gguf-editor-gui
+gguf-new-metadata
+gguf-set-metadata
+gss-client
+hf
+httpx
+huggingface-cli
+humanfriendly
+hypercorn
+icu-config
+icuexportdata
+icuinfo
+idle3
+idle3.10
+infocmp
+infotocap
+installcheck
+isympy
+jp.py
+jpegtran
+json-playground
+jsondiff
+jsonpatch
+jsonpointer
+jsonschema
+k5srvutil
+kadmin
+kdestroy
+keyctl
+kinit
+klist
+kpasswd
+krb5-config
+ksu
+kswitch
+ktutil
+kvno
+lmcache
+lmcache_controller
+lmcache_server
+lmcache_v0_server
+lz4
+lz4c
+lz4cat
+lzcat
+lzcmp
+lzdiff
+lzegrep
+lzfgrep
+lzgrep
+lzless
+lzma
+lzmadec
+lzmainfo
+lzmore
+makeconv
+mamba
+mamba-package
+markdown-it
+mc_store_rest_server
+mcoplib_version
+mcp
+mergesolv
+mistral_common
+mooncake_http_metadata_server
+mooncake_master
+ncurses6-config
+ncursesw6-config
+nghttp
+nghttpd
+nghttpx
+ninja
+normalizer
+numba
+openai
+openssl
+pidproxy
+pip
+pip3
+pkgdata
+py.test
+pybase64
+pybind11-config
+pydoc
+pydoc3
+pydoc3.10
+pygmentize
+pytest
+python
+python3
+python3-config
+python3.1
+python3.10
+python3.10-config
+quart
+ray
+rdjpgcom
+repo2solv
+reset
+sclient
+serve
+setuptools-scm
+sim_client
+sqlite3_analyzer
+standard-supervisor
+supervisorctl
+supervisord
+tabs
+tclsh
+tclsh8.6
+testsolv
+tic
+tiny-agents
+tjbench
+toe
+torchfrtrace
+torchrun
+tox
+tput
+tqdm
+transfer_engine_bench
+transfer_engine_topology_dump
+transformers
+transformers-cli
+tset
+tune
+tvm-ffi-config
+tvm-ffi-stubgen
+typer
+undill
+unlz4
+unlzma
+unxz
+unzstd
+uuclient
+uvicorn
+vcs-versioning
+virtualenv
+vllm
+vllm_collect_env
+watchfiles
+websockets
+wheel
+wish
+wish8.6
+wrjpgcom
+x86_64-conda-linux-gnu-ld
+x86_64-conda_cos6-linux-gnu-ld
+xml2-config
+xmlcatalog
+xmllint
+xz
+xzcat
+xzcmp
+xzdec
+xzdiff
+xzegrep
+xzfgrep
+xzgrep
+xzless
+xzmore
+zstd
+zstdcat
+zstdgrep
+zstdless
+zstdmt
+(finetuning) [root@localhost backend]#