|
|
@@ -40,6 +40,7 @@ class TextEngine(BaseEngine):
|
|
|
|
|
|
# 如果本地没有,从 HF 下载
|
|
|
if not (Path(local_path) / "config.json").exists():
|
|
|
+ logger.info(f"Model not found locally, downloading from HuggingFace: {model_id}")
|
|
|
from huggingface_hub import snapshot_download
|
|
|
|
|
|
snapshot_download(
|
|
|
@@ -47,9 +48,10 @@ class TextEngine(BaseEngine):
|
|
|
local_dir=local_path,
|
|
|
local_dir_use_symlinks=False,
|
|
|
)
|
|
|
+ logger.info(f"Model download completed: {model_id}")
|
|
|
|
|
|
quantization = kwargs.get("quantization", None)
|
|
|
-
|
|
|
+
|
|
|
# 日志:检查 GPU 状态
|
|
|
logger.info(f"CUDA available: {torch.cuda.is_available()}")
|
|
|
logger.info(f"CUDA device count: {torch.cuda.device_count()}")
|
|
|
@@ -59,8 +61,14 @@ class TextEngine(BaseEngine):
|
|
|
logger.info(f"GPU {i} memory: {torch.cuda.get_device_properties(i).total_memory / (1024**3):.2f} GB")
|
|
|
else:
|
|
|
logger.warning("No GPU detected! Training will run on CPU.")
|
|
|
-
|
|
|
- max_memory = {i: "4GB" for i in range(torch.cuda.device_count())} if torch.cuda.is_available() else None
|
|
|
+
|
|
|
+ logger.info(f"Loading model from: {local_path} (quantization={quantization})")
|
|
|
+
|
|
|
+ max_memory = None
|
|
|
+ if torch.cuda.is_available():
|
|
|
+ # Cap each GPU at 90% of its total memory, floored to whole GiB (presumably to leave headroom; a device under ~1.12 GiB would get "0GiB")
|
|
|
+ max_memory = {i: f"{int(torch.cuda.get_device_properties(i).total_memory * 0.9 // (1024**3))}GiB"
|
|
|
+ for i in range(torch.cuda.device_count())}
|
|
|
|
|
|
load_kwargs: dict[str, Any] = {
|
|
|
"torch_dtype": torch.float16,
|
|
|
@@ -84,6 +92,7 @@ class TextEngine(BaseEngine):
|
|
|
|
|
|
self._model = AutoModelForCausalLM.from_pretrained(local_path, **load_kwargs)
|
|
|
logger.info(f"Loaded model: {model_id}")
|
|
|
+ logger.info(f"Model loading complete, returning from load_model()")
|
|
|
|
|
|
def get_peft_config(self, method: str, params: dict[str, Any]) -> Any:
|
|
|
"""根据 PEFT 方法返回对应的配置对象。"""
|
|
|
@@ -142,12 +151,21 @@ class TextEngine(BaseEngine):
|
|
|
save_strategy = training_args.get("save_strategy", "epoch")
|
|
|
deepspeed_config = training_args.get("deepspeed", None)
|
|
|
|
|
|
+ logger.info(f"Training args: task_type={task_type}, epochs={epochs}, batch_size={batch_size}, "
|
|
|
+ f"gradient_accumulation={gradient_accumulation}, lr={learning_rate}, "
|
|
|
+ f"max_seq_length={max_seq_length}, warmup_ratio={warmup_ratio}, "
|
|
|
+ f"save_strategy={save_strategy}, deepspeed={'enabled' if deepspeed_config else 'disabled'}")
|
|
|
+ logger.info(f"Loading dataset from: {dataset_path}")
|
|
|
+
|
|
|
dataset = self._tokenize_dataset(dataset_path, max_seq_length)
|
|
|
+ logger.info(f"Dataset tokenized: {len(dataset)} samples")
|
|
|
|
|
|
self._model = get_peft_model(self._model, peft_config)
|
|
|
+ logger.info(f"PEFT model created, trainable parameters:")
|
|
|
self._model.print_trainable_parameters()
|
|
|
|
|
|
output_dir = str(settings.adapters_dir / job_id)
|
|
|
+ logger.info(f"Adapter output directory: {output_dir}")
|
|
|
tr_args = TrainingArguments(
|
|
|
output_dir=output_dir,
|
|
|
num_train_epochs=epochs,
|
|
|
@@ -235,7 +253,9 @@ class TextEngine(BaseEngine):
|
|
|
)
|
|
|
|
|
|
try:
|
|
|
+ logger.info(f"Trainer created, starting trainer.train()...")
|
|
|
trainer.train()
|
|
|
+ logger.info(f"trainer.train() returned, saving adapter...")
|
|
|
self._model.save_pretrained(output_dir)
|
|
|
self._tokenizer.save_pretrained(output_dir)
|
|
|
logger.info(f"Training completed for job {job_id}")
|