|
@@ -104,14 +104,12 @@ class InferenceWorker:
|
|
|
self.tokenizer.pad_token = self.tokenizer.eos_token
|
|
self.tokenizer.pad_token = self.tokenizer.eos_token
|
|
|
|
|
|
|
|
print(f"[worker] Loading model from: {model_path}", flush=True)
|
|
print(f"[worker] Loading model from: {model_path}", flush=True)
|
|
|
|
|
+ # 单卡加载,与训练 DDP 模式一致:每个进程只用一张 GPU
|
|
|
|
|
+ # 避免 device_map="auto" 拆分模型导致 rotary_emb 等共享模块跨卡报错
|
|
|
|
|
+ # CUDA_VISIBLE_DEVICES 由启动脚本设置,cuda:0 就是第一张可见 GPU
|
|
|
if torch.cuda.is_available():
|
|
if torch.cuda.is_available():
|
|
|
- num_gpus = torch.cuda.device_count()
|
|
|
|
|
- if num_gpus > 1:
|
|
|
|
|
- device_map = self._build_device_map(model_path, num_gpus)
|
|
|
|
|
- print(f"[worker] Multi-GPU device_map ({num_gpus} GPUs): {device_map}", flush=True)
|
|
|
|
|
- else:
|
|
|
|
|
- device_map = {"": 0}
|
|
|
|
|
- print(f"[worker] Single GPU device_map: cuda:0", flush=True)
|
|
|
|
|
|
|
+ device_map = {"": 0}
|
|
|
|
|
+ print(f"[worker] Single GPU device_map: cuda:0", flush=True)
|
|
|
else:
|
|
else:
|
|
|
device_map = "cpu"
|
|
device_map = "cpu"
|
|
|
print("[worker] CPU device_map", flush=True)
|
|
print("[worker] CPU device_map", flush=True)
|
|
@@ -122,43 +120,6 @@ class InferenceWorker:
|
|
|
self.torch = torch
|
|
self.torch = torch
|
|
|
print("[worker] Model loaded successfully.", flush=True)
|
|
print("[worker] Model loaded successfully.", flush=True)
|
|
|
|
|
|
|
|
- @staticmethod
|
|
|
|
|
- def _build_device_map(model_path: str, num_gpus: int) -> dict:
|
|
|
|
|
- """构建多卡 device_map,确保 tied weights 在同一张卡上。
|
|
|
|
|
-
|
|
|
|
|
- HuggingFace 的 device_map="auto" 有时无法正确处理 tied weights
|
|
|
|
|
- (embed_tokens 和 lm_head 共享权重),导致它们被分到不同 GPU。
|
|
|
|
|
- 这里手动构建映射,将 tied weights 强制放在同一张卡。
|
|
|
|
|
- """
|
|
|
|
|
- from transformers import AutoConfig
|
|
|
|
|
-
|
|
|
|
|
- config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
|
|
|
|
|
- num_layers = getattr(config, "num_hidden_layers", None)
|
|
|
|
|
- if num_layers is None:
|
|
|
|
|
- return "auto"
|
|
|
|
|
-
|
|
|
|
|
- layers_per_gpu = num_layers // num_gpus
|
|
|
|
|
- remainder = num_layers % num_gpus
|
|
|
|
|
- device_map = {}
|
|
|
|
|
- layer_idx = 0
|
|
|
|
|
- for gpu in range(num_gpus):
|
|
|
|
|
- count = layers_per_gpu + (1 if gpu < remainder else 0)
|
|
|
|
|
- for _ in range(count):
|
|
|
|
|
- device_map[f"model.layers.{layer_idx}"] = gpu
|
|
|
|
|
- layer_idx += 1
|
|
|
|
|
-
|
|
|
|
|
- # 核心:tied weights 强制放在同一张卡(第 0 张)
|
|
|
|
|
- # embed_tokens 和 lm_head 共享 Embedding 权重
|
|
|
|
|
- device_map["model.embed_tokens"] = 0
|
|
|
|
|
- device_map["model.norm"] = 0
|
|
|
|
|
- device_map["lm_head"] = 0
|
|
|
|
|
-
|
|
|
|
|
- # Qwen 等模型可能有 rotary_emb
|
|
|
|
|
- if hasattr(config, "rope_theta") or hasattr(config, "rotary_emb"):
|
|
|
|
|
- device_map["model.rotary_emb"] = 0
|
|
|
|
|
-
|
|
|
|
|
- return device_map
|
|
|
|
|
-
|
|
|
|
|
def generate(self, request: dict) -> dict:
|
|
def generate(self, request: dict) -> dict:
|
|
|
"""处理一次推理请求。"""
|
|
"""处理一次推理请求。"""
|
|
|
# 支持两种输入:messages(OpenAI 格式)或 prompt(原始文本)
|
|
# 支持两种输入:messages(OpenAI 格式)或 prompt(原始文本)
|