|
@@ -104,12 +104,17 @@ class InferenceWorker:
|
|
|
self.tokenizer.pad_token = self.tokenizer.eos_token
|
|
self.tokenizer.pad_token = self.tokenizer.eos_token
|
|
|
|
|
|
|
|
print(f"[worker] Loading model from: {model_path}", flush=True)
|
|
print(f"[worker] Loading model from: {model_path}", flush=True)
|
|
|
- # 单卡加载到 cuda:0(CUDA_VISIBLE_DEVICES 已限制可见 GPU)
|
|
|
|
|
- # 不使用 device_map="auto" 避免多卡时 tied weights 分到不同 GPU 导致报错
|
|
|
|
|
if torch.cuda.is_available():
|
|
if torch.cuda.is_available():
|
|
|
- device_map = {"": 1}
|
|
|
|
|
|
|
+ num_gpus = torch.cuda.device_count()
|
|
|
|
|
+ if num_gpus > 1:
|
|
|
|
|
+ device_map = self._build_device_map(model_path, num_gpus)
|
|
|
|
|
+ print(f"[worker] Multi-GPU device_map ({num_gpus} GPUs): {device_map}", flush=True)
|
|
|
|
|
+ else:
|
|
|
|
|
+ device_map = {"": 0}
|
|
|
|
|
+ print(f"[worker] Single GPU device_map: cuda:0", flush=True)
|
|
|
else:
|
|
else:
|
|
|
device_map = "cpu"
|
|
device_map = "cpu"
|
|
|
|
|
+ print("[worker] CPU device_map", flush=True)
|
|
|
self.model = AutoModelForCausalLM.from_pretrained(
|
|
self.model = AutoModelForCausalLM.from_pretrained(
|
|
|
model_path, torch_dtype=torch.float16, device_map=device_map,
|
|
model_path, torch_dtype=torch.float16, device_map=device_map,
|
|
|
)
|
|
)
|
|
@@ -117,6 +122,43 @@ class InferenceWorker:
|
|
|
self.torch = torch
|
|
self.torch = torch
|
|
|
print("[worker] Model loaded successfully.", flush=True)
|
|
print("[worker] Model loaded successfully.", flush=True)
|
|
|
|
|
|
|
|
|
|
@staticmethod
def _build_device_map(model_path: str, num_gpus: int) -> "dict[str, int] | str":
    """Build a multi-GPU ``device_map`` that keeps tied weights on one GPU.

    Hugging Face's ``device_map="auto"`` sometimes fails to handle tied
    weights correctly (``embed_tokens`` and ``lm_head`` share one weight
    tensor) and places them on different GPUs. This method builds the
    mapping by hand and forces the tied modules onto the same device.

    The decoder layers are spread as evenly as possible over the GPUs;
    when the layer count is not divisible by ``num_gpus`` the lower-indexed
    GPUs each take one extra layer.

    NOTE(review): the module names used as keys (``model.layers.N``,
    ``model.embed_tokens``, ``model.norm``, ``lm_head``,
    ``model.rotary_emb``) are hard-coded, so this presumably only fits
    architectures that use this naming -- confirm for every model that is
    served through this worker.

    Args:
        model_path: Model location handed to ``AutoConfig.from_pretrained``.
        num_gpus: Number of GPUs to distribute the layers over; must be >= 1.

    Returns:
        A dict mapping module names to GPU indices, or the string ``"auto"``
        when the config has no ``num_hidden_layers`` attribute and no manual
        layout can be computed.

    Raises:
        ValueError: If ``num_gpus`` is smaller than 1.
    """
    # Fail fast, before any config I/O: zero would otherwise surface as a
    # bare ZeroDivisionError and a negative count would silently yield a map
    # that contains no layers at all.
    if num_gpus < 1:
        raise ValueError(f"num_gpus must be >= 1, got {num_gpus!r}")

    # Imported lazily so that transformers is only required when a
    # multi-GPU map is actually built.
    from transformers import AutoConfig

    # NOTE(review): trust_remote_code=True lets the model repository run its
    # own Python code while the config is loaded -- confirm that model_path
    # always points at a trusted source.
    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
    num_layers = getattr(config, "num_hidden_layers", None)
    if num_layers is None:
        # Without a layer count there is nothing to partition; defer to the
        # library's automatic placement.
        return "auto"

    layers_per_gpu, remainder = divmod(num_layers, num_gpus)
    device_map = {}
    next_layer = 0
    for gpu in range(num_gpus):
        # The first `remainder` GPUs take one extra layer each.
        count = layers_per_gpu + (1 if gpu < remainder else 0)
        for layer_idx in range(next_layer, next_layer + count):
            device_map[f"model.layers.{layer_idx}"] = gpu
        next_layer += count

    # Core of the workaround: the tied weights are pinned to one GPU (GPU 0).
    # embed_tokens and lm_head share the embedding weight.
    device_map["model.embed_tokens"] = 0
    device_map["model.norm"] = 0
    device_map["lm_head"] = 0

    # Models such as Qwen may expose a separate rotary embedding module.
    if hasattr(config, "rope_theta") or hasattr(config, "rotary_emb"):
        device_map["model.rotary_emb"] = 0

    return device_map
|
|
|
|
|
+
|
|
|
def generate(self, request: dict) -> dict:
|
|
def generate(self, request: dict) -> dict:
|
|
|
"""处理一次推理请求。"""
|
|
"""处理一次推理请求。"""
|
|
|
# 支持两种输入:messages(OpenAI 格式)或 prompt(原始文本)
|
|
# 支持两种输入:messages(OpenAI 格式)或 prompt(原始文本)
|