# job_queue.py (20 KB)
  1. import asyncio
  2. import json
  3. from datetime import datetime, timezone
  4. from enum import Enum
  5. from typing import Any, Callable, Coroutine, Optional
  6. from pydantic import BaseModel, Field
  7. from app.core.logging import logger
  class JobStatus(str, Enum):
      """Lifecycle states of a training job.

      The enum mixes in ``str``, so every member compares equal to its raw
      string value (``JobStatus.FAILED == "failed"``).
      """

      PENDING = "pending"
      QUEUED = "queued"
      PREPROCESSING = "preprocessing"
      TRAINING = "training"
      COMPLETED = "completed"
      EVALUATING = "evaluating"
      EVALUATION_DONE = "evaluation_done"
      FAILED = "failed"
      CANCELLED = "cancelled"

      @property
      def is_terminal(self) -> bool:
          """Return True for COMPLETED, FAILED, CANCELLED and EVALUATION_DONE."""
          # Look the members up on the class rather than on ``self``: reaching
          # one member through another is discouraged by the enum docs and is
          # not handled uniformly across Python releases.
          states = type(self)
          return self in (
              states.COMPLETED,
              states.FAILED,
              states.CANCELLED,
              states.EVALUATION_DONE,
          )
  def _utc_now_iso() -> str:
      """Return the current UTC time as an ISO-8601 string."""
      return datetime.now(timezone.utc).isoformat()


  class TrainingJob(BaseModel):
      """In-memory record describing one training job and its live progress."""

      # --- identity / inputs ---
      id: str
      model_id: str
      model_type: str
      peft_method: str
      dataset_id: str
      config: dict = Field(default_factory=dict)

      # --- live state ---
      status: JobStatus = JobStatus.PENDING
      progress: float = 0.0
      current_epoch: int = 0
      current_step: int = 0
      total_steps: int = 0
      loss: float | None = None

      # --- outcome ---
      adapter_path: str | None = None
      error_message: str | None = None

      # --- timestamps (ISO-8601 strings) ---
      created_at: str = Field(default_factory=_utc_now_iso)
      started_at: str | None = None
      finished_at: str | None = None
  class JobQueue:
      """Asynchronous training-job queue with cancellation and concurrency control.

      Jobs are kept in an in-memory registry keyed by job ID.  ``start`` spawns
      ``max_concurrent`` worker tasks that pull job IDs from an
      ``asyncio.Queue`` and execute them with ``_run_job``.  After every state
      change the callbacks added through ``register_callback`` are awaited.
      """

      def __init__(self, max_concurrent: int = 2):
          """Create an idle queue.

          Args:
              max_concurrent: Number of worker tasks ``start`` spawns, i.e. the
                  maximum number of jobs processed at the same time.
          """
          self._queue: asyncio.Queue[str] = asyncio.Queue()
          self._jobs: dict[str, TrainingJob] = {}
          self._cancel_events: dict[str, asyncio.Event] = {}
          self._callbacks: list[Callable[[TrainingJob], Coroutine[Any, Any, None]]] = []
          self._max_concurrent = max_concurrent
          self._workers: list[asyncio.Task] = []
          self._running = False

      # ------------------------------------------------------------------
      # Lifecycle
      # ------------------------------------------------------------------
      async def start(self):
          """Spawn the background workers; does nothing when already running."""
          if self._running:
              return
          self._running = True
          for _ in range(self._max_concurrent):
              worker = asyncio.create_task(self._worker_loop())
              self._workers.append(worker)
          logger.info(f"JobQueue started with {self._max_concurrent} workers")

      async def stop(self):
          """Signal cancellation to every job, cancel the workers and wait for them."""
          self._running = False
          for event in self._cancel_events.values():
              event.set()
          workers, self._workers = self._workers, []
          for worker in workers:
              worker.cancel()
          if workers:
              # Wait for the cancelled tasks so their cleanup (status update,
              # remote kill) has finished before stop() returns.
              await asyncio.gather(*workers, return_exceptions=True)
          logger.info("JobQueue stopped")

      # ------------------------------------------------------------------
      # Public queue / registry API
      # ------------------------------------------------------------------
      async def enqueue(self, job_id: str, job: TrainingJob):
          """Register ``job`` under ``job_id`` and put it on the queue."""
          self._jobs[job_id] = job
          self._cancel_events[job_id] = asyncio.Event()
          await self._queue.put(job_id)
          logger.info(f"Job {job_id} enqueued")

      async def dequeue(self) -> str:
          """Wait for and return the next queued job ID.

          The internal workers read from the same queue, so an ID taken here is
          not processed by them.
          """
          return await self._queue.get()

      def mark_done(self, job_id: str):
          """Acknowledge a job obtained through ``dequeue`` and drop its cancel event.

          The workers acknowledge their own jobs; ``asyncio.Queue.task_done``
          raises ``ValueError`` when called more often than items were taken.
          """
          self._queue.task_done()
          self._cancel_events.pop(job_id, None)

      def get_job(self, job_id: str) -> Optional[TrainingJob]:
          """Return the registered job, or ``None`` for an unknown ID."""
          return self._jobs.get(job_id)

      def update_job(self, job_id: str, **kwargs):
          """Set the given attributes on a registered job.

          Unknown job IDs, and keyword names the job has no attribute for, are
          silently ignored.
          """
          job = self._jobs.get(job_id)
          if job is None:
              return
          for key, val in kwargs.items():
              if hasattr(job, key):
                  setattr(job, key, val)

      def is_cancelled(self, job_id: str) -> bool:
          """Return True when cancellation has been requested for ``job_id``."""
          event = self._cancel_events.get(job_id)
          return event is not None and event.is_set()

      async def cancel(self, job_id: str):
          """Request cancellation of a job and mark it CANCELLED.

          Unknown jobs are ignored.  Jobs that already reached a terminal state
          keep their recorded outcome instead of being relabelled CANCELLED.
          """
          event = self._cancel_events.get(job_id)
          if event is None:
              return
          job = self._jobs.get(job_id)
          # JobStatus(...) also accepts a raw string status.
          if job is not None and JobStatus(job.status).is_terminal:
              return
          event.set()
          self.update_job(job_id, status=JobStatus.CANCELLED)
          await self._notify_callbacks()
          logger.info(f"Job {job_id} cancelled")

      def register_callback(self, callback: Callable[[TrainingJob], Coroutine[Any, Any, None]]):
          """Register an async state-change callback (e.g. to update the database)."""
          self._callbacks.append(callback)

      @property
      def jobs(self) -> dict[str, TrainingJob]:
          """A shallow copy of the job registry."""
          return dict(self._jobs)

      # ------------------------------------------------------------------
      # Internals: notification and worker loop
      # ------------------------------------------------------------------
      async def _notify_callbacks(self):
          """Await every callback once for every registered job.

          Errors are logged per (callback, job) pair so one failure does not
          suppress the remaining notifications.
          """
          # NOTE(review): every notification walks *all* registered jobs, not
          # only the one that changed; kept because callers may rely on it.
          # Snapshots: enqueue()/register_callback() may run while we await.
          jobs_snapshot = list(self._jobs.values())
          for cb in list(self._callbacks):
              for job in jobs_snapshot:
                  try:
                      await cb(job)
                  except Exception as e:
                      logger.error(f"JobQueue callback error: {e}")

      async def _worker_loop(self):
          """Worker loop: keep taking job IDs from the queue and running them."""
          while self._running:
              try:
                  # The timeout lets the loop re-check ``_running`` regularly.
                  job_id = await asyncio.wait_for(self._queue.get(), timeout=1.0)
              except asyncio.TimeoutError:
                  continue
              try:
                  await self._run_job(job_id)
              except Exception as e:
                  logger.error(f"Job {job_id} failed: {e}")
                  self.update_job(job_id, status=JobStatus.FAILED, error_message=str(e))
                  # Make sure observers learn about the failure as well.
                  await self._notify_callbacks()
              finally:
                  self._queue.task_done()

      async def _mark_cancelled(self, job_id: str) -> None:
          """Ensure the job is recorded as CANCELLED, notifying only on a change."""
          job = self._jobs.get(job_id)
          if job is not None and job.status == JobStatus.CANCELLED:
              return
          self.update_job(job_id, status=JobStatus.CANCELLED)
          await self._notify_callbacks()

      # ------------------------------------------------------------------
      # Job execution
      # ------------------------------------------------------------------
      async def _run_job(self, job_id: str):
          """Run one job: preprocess, then train locally or remotely.

          Failures are recorded on the job (FAILED + ``error_message``) rather
          than raised.  ``asyncio.CancelledError`` is re-raised after the job has
          been marked CANCELLED.
          """
          job = self._jobs.get(job_id)
          if not job:
              return

          # Check before touching the status: a job cancelled while it was
          # still waiting in the queue must not be flipped back to QUEUED.
          if self.is_cancelled(job_id):
              await self._mark_cancelled(job_id)
              return
          self.update_job(job_id, status=JobStatus.QUEUED)
          await self._notify_callbacks()
          if self.is_cancelled(job_id):
              await self._mark_cancelled(job_id)
              return
          self.update_job(
              job_id,
              status=JobStatus.PREPROCESSING,
              started_at=datetime.now(timezone.utc).isoformat(),
          )
          await self._notify_callbacks()
          if self.is_cancelled(job_id):
              await self._mark_cancelled(job_id)
              return

          # Resolved outside the try block so the handlers below can rely on
          # both names being bound.
          from app.config import get_settings
          settings = get_settings()
          pid = None  # remote process id, set once remote training is launched

          # NOTE(review): ``finished_at`` is never written by this class.
          try:
              config = job.config
              model_id = job.model_id
              model_type = job.model_type
              peft_method = job.peft_method
              dataset_id = config.get("dataset_id", job.dataset_id)

              # Locate the dataset file: database first, then the file system.
              dataset_path = await self._lookup_dataset_db(dataset_id)
              if not dataset_path:
                  dataset_path = self._find_dataset_path(dataset_id)
              if not dataset_path:
                  raise FileNotFoundError(f"Dataset not found: {dataset_id}")

              engine = self._get_engine(model_type)

              # Preprocessing always runs through the local engine.
              processed_path = str(settings.processed_dir / f"{job_id}_processed.jsonl")
              task_type = config.get("task_type", "sft")
              template = config.get("dataset_template", "alpaca")
              await engine.preprocess_dataset(
                  dataset_path, processed_path, task_type=task_type, template=template
              )

              if settings.use_remote_compute:
                  # Remote training.
                  # NOTE(review): the raw ``dataset_path`` is sent, not
                  # ``processed_path`` -- confirm this is intended.
                  self.update_job(job_id, status=JobStatus.TRAINING)
                  await self._notify_callbacks()
                  from app.core.remote_executor import run_training_remote
                  # Run the synchronous launcher in a thread so it does not
                  # block the event loop.
                  pid = await asyncio.to_thread(
                      run_training_remote, job_id, model_id, model_type, dataset_path, config
                  )
                  if not pid:
                      raise RuntimeError("Failed to launch remote training")
                  logger.info(f"Remote training launched for job {job_id}")
                  # Polling records the final status (completed/failed/cancelled).
                  await self._poll_remote_progress(job_id, pid)
              else:
                  # Local training.
                  await engine.load_model(
                      model_id, quantization="4bit" if peft_method == "qlora" else None
                  )
                  peft_config = engine.get_peft_config(peft_method, config)
                  self.update_job(job_id, status=JobStatus.TRAINING)
                  await self._notify_callbacks()
                  adapter_path = await engine.train(
                      job_id=job_id,
                      dataset_path=processed_path,
                      peft_config=peft_config,
                      training_args=config,
                  )
                  self.update_job(job_id, status=JobStatus.COMPLETED, adapter_path=adapter_path)
                  await self._notify_callbacks()
                  logger.info(f"Job {job_id} completed successfully")
          except asyncio.CancelledError:
              # The worker task itself was cancelled (see stop()).
              self.update_job(job_id, status=JobStatus.CANCELLED)
              await self._notify_callbacks()
              if pid:
                  await self._kill_remote_process(job_id, pid)
              raise  # cancellation must propagate to the task
          except Exception as e:
              error_msg = str(e)
              # Remote mode: do not leave the remote process running.
              if settings.use_remote_compute and pid:
                  await self._kill_remote_process(job_id, pid)
              logger.error(f"Job {job_id} failed: {error_msg}")
              self.update_job(job_id, status=JobStatus.FAILED, error_message=error_msg)
              await self._notify_callbacks()

      def _find_dataset_path(self, dataset_id: str) -> str | None:
          """Resolve ``dataset_id`` on the file system.

          Tries ``<uploads_dir>/<dataset_id>`` first, then ``dataset_id`` itself
          as a path.  Returns ``None`` when neither exists.
          """
          from pathlib import Path
          from app.config import get_settings

          uploaded = get_settings().uploads_dir / dataset_id
          if uploaded.exists():
              return str(uploaded)
          if Path(dataset_id).exists():
              return dataset_id
          return None

      async def _lookup_dataset_db(self, dataset_id: str) -> str | None:
          """Return the file path of the dataset record whose id or name matches."""
          from app.core.db import async_session, DatasetRecord
          from sqlalchemy import select

          async with async_session() as session:
              result = await session.execute(select(DatasetRecord).where(
                  (DatasetRecord.id == dataset_id) | (DatasetRecord.name == dataset_id)
              ))
              record = result.scalar_one_or_none()
              if record:
                  return record.file_path
          return None

      def _get_engine(self, model_type: str):
          """Pick the training engine for ``model_type`` (text is the fallback)."""
          if model_type == "vision":
              from app.engines.vision_engine import vision_engine
              return vision_engine
          if model_type == "multimodal":
              from app.engines.multimodal_engine import multimodal_engine
              return multimodal_engine
          from app.engines.text_engine import text_engine
          return text_engine

      # ------------------------------------------------------------------
      # Remote execution helpers
      # ------------------------------------------------------------------
      async def _kill_remote_process(self, job_id: str, pid: str) -> None:
          """Best-effort SIGKILL of the remote process and its children.

          Never raises: a failed kill is logged so that it cannot mask the
          status update the caller is about to record.
          """
          from app.config import get_settings
          from app.core.remote_executor import ssh_exec

          container = get_settings().compute_node_docker_container
          kill_cmd = (
              f"docker exec {container} bash -c 'kill -9 {pid} 2>/dev/null; "
              f"pkill -9 -P {pid} 2>/dev/null'"
          )
          try:
              await asyncio.to_thread(ssh_exec, kill_cmd, timeout=15)
          except Exception as e:
              logger.warning(f"Could not kill remote process {pid} for job {job_id}: {e}")
          else:
              logger.info(f"Killed remote process {pid} for job {job_id}")

      async def _fail_remote_job(self, job_id: str, pid: str, error_msg: str) -> None:
          """Kill the remote process, then record and broadcast the failure."""
          from app.core.websocket import send_error

          # Kill first so the remote GPU is not left occupied.
          await self._kill_remote_process(job_id, pid)
          self.update_job(job_id, status=JobStatus.FAILED, error_message=error_msg)
          await self._notify_callbacks()
          await send_error(job_id, error_msg)

      async def _remote_file_size(self, container: str, path: str) -> int:
          """Return the byte size of ``path`` inside the container, 0 on any problem."""
          from app.core.remote_executor import ssh_exec

          cmd = f"docker exec {container} bash -c 'wc -c < {path} 2>/dev/null || echo 0'"
          code, out, _ = await asyncio.to_thread(ssh_exec, cmd, timeout=30)
          try:
              return int(out.strip()) if code == 0 and out.strip() else 0
          except ValueError:
              return 0

      async def _read_remote_range(self, container: str, path: str, offset: int, length: int) -> str:
          """Read ``length`` bytes of ``path`` starting at byte ``offset``.

          The read is capped with ``head -c`` so bytes appended after the size
          was measured are left for the next poll instead of being read twice.
          Returns an empty string when the command fails.
          """
          from app.core.remote_executor import ssh_exec

          cmd = (
              f"docker exec {container} bash -c "
              f"'tail -c +{offset + 1} {path} 2>/dev/null | head -c {length}'"
          )
          code, out, _ = await asyncio.to_thread(ssh_exec, cmd, timeout=30)
          return out if code == 0 and out else ""

      @staticmethod
      def _parse_json_line(raw: str) -> dict | None:
          """Parse one JSONL line; return ``None`` unless it is a JSON object."""
          raw = raw.strip()
          if not raw:
              return None
          try:
              parsed = json.loads(raw)
          except json.JSONDecodeError:
              return None
          return parsed if isinstance(parsed, dict) else None

      @classmethod
      def _drain_jsonl(cls, pending: str, chunk: str) -> tuple[list[dict], str]:
          """Split ``pending + chunk`` into parsed entries and an unfinished tail.

          A final piece without a trailing newline is returned as the new
          ``pending`` text unless it already parses as a complete JSON object,
          so a line caught mid-write is completed on the next poll.
          """
          *lines, fragment = (pending + chunk).split("\n")
          entries = [e for e in map(cls._parse_json_line, lines) if e is not None]
          leftover = ""
          if fragment.strip():
              entry = cls._parse_json_line(fragment)
              if entry is not None:
                  entries.append(entry)
              else:
                  leftover = fragment
          return entries, leftover

      async def _apply_remote_entry(self, job_id: str, pid: str, entry: dict) -> bool:
          """Apply one progress-log entry; return True when it ends the job."""
          from app.core.websocket import send_progress, send_epoch_done, send_completed

          entry_type = entry.get("type")
          if entry_type == "progress":
              # ``or 0`` also covers explicit nulls in the log.
              step = entry.get("step") or 0
              total = entry.get("total_steps") or 0
              self.update_job(
                  job_id,
                  current_step=step,
                  total_steps=total,
                  loss=entry.get("loss"),
                  progress=round(step / max(total, 1) * 100, 1),
              )
              await self._notify_callbacks()
              await send_progress(job_id, **{k: v for k, v in entry.items() if k != "type"})
          elif entry_type == "epoch_begin":
              self.update_job(job_id, current_epoch=entry.get("epoch", 0))
              await self._notify_callbacks()
          elif entry_type == "epoch_done":
              await self._notify_callbacks()
              await send_epoch_done(
                  job_id, **{k: v for k, v in entry.items() if k not in ("type", "ts")}
              )
          elif entry_type == "completed":
              from app.config import get_settings
              adapter_path = entry.get(
                  "adapter_path", str(get_settings().adapters_dir / job_id)
              )
              self.update_job(
                  job_id,
                  status=JobStatus.COMPLETED,
                  adapter_path=adapter_path,
                  progress=100.0,
              )
              await self._notify_callbacks()
              await send_completed(
                  job_id, **{k: v for k, v in entry.items() if k not in ("type", "ts")}
              )
              return True
          elif entry_type == "error":
              error_msg = entry.get("message", "Unknown error")
              logger.error(f"Remote job {job_id} failed: {error_msg}")
              await self._fail_remote_job(job_id, pid, error_msg)
              return True
          return False

      async def _mirror_remote_stderr(self, job_id: str, container: str, path: str, offset: int) -> int:
          """Copy new lines of the remote stderr log into the backend log.

          Lines are tagged ``[253:<first 8 chars of job_id>]`` and logged at a
          level guessed from their content.  Returns the new read offset.
          """
          size = await self._remote_file_size(container, path)
          if size <= offset:
              return offset
          chunk = await self._read_remote_range(container, path, offset, size - offset)
          tag = f"[253:{job_id[:8]}]"
          for line in chunk.strip().split("\n"):
              line = line.strip()
              if not line:
                  continue
              if "[remote_train]" in line:
                  logger.info(f"{tag} {line}")
              elif "[MXKW][E]" in line or "ERROR" in line or "Error" in line:
                  logger.error(f"{tag} {line}")
              elif "[transformers]" in line or "UserWarning" in line or "Warning" in line:
                  logger.warning(f"{tag} {line}")
              else:
                  logger.info(f"{tag} {line}")
          return size

      async def _poll_remote_progress(self, job_id: str, pid: str):
          """Follow a remote training run until it completes, fails or is cancelled.

          Each poll checks for cancellation, reads new entries from the remote
          JSONL progress log over SSH (in worker threads, so the event loop
          stays free) and mirrors the remote stderr log into the backend log.
          The final job status is recorded here; nothing is returned.
          """
          from app.config import get_settings
          from app.core.websocket import send_error
          from app.core.remote_executor import is_process_running

          settings = get_settings()
          remote_log = f"{settings.compute_node_remote_data_dir}/logs/{job_id}.jsonl"
          stderr_log = f"/tmp/train_{job_id}.log"
          container = settings.compute_node_docker_container

          poll_interval = 5   # seconds between polls
          max_polls = 8640    # 8640 polls * 5 s = 12 h before timing out
          exit_grace = 2      # seconds to wait before confirming an exit

          log_offset = 0      # bytes of the progress log consumed so far
          stderr_offset = 0   # bytes of the stderr log mirrored so far
          pending = ""        # unfinished trailing line of the progress log
          exit_seen = False   # process was already found dead on the last poll

          for _ in range(max_polls):
              if self.is_cancelled(job_id):
                  await self._kill_remote_process(job_id, pid)
                  self.update_job(job_id, status=JobStatus.CANCELLED)
                  await self._notify_callbacks()
                  await send_error(job_id, "Training cancelled")
                  return

              # Liveness is sampled before the log is read, so everything the
              # process wrote before exiting is visible to the read below.
              process_alive = await asyncio.to_thread(is_process_running, pid)

              # === 1. progress log (JSONL) ===
              has_new_log = False
              size = await self._remote_file_size(container, remote_log)
              if size > log_offset:
                  chunk = await self._read_remote_range(
                      container, remote_log, log_offset, size - log_offset
                  )
                  log_offset = size
                  if chunk.strip():
                      has_new_log = True
                      entries, pending = self._drain_jsonl(pending, chunk)
                      for entry in entries:
                          if await self._apply_remote_entry(job_id, pid, entry):
                              return

              # === 2. mirror the remote stderr log ===
              stderr_offset = await self._mirror_remote_stderr(
                  job_id, container, stderr_log, stderr_offset
              )

              # === 3. process exited without a completed/error entry ===
              if process_alive:
                  exit_seen = False
              elif not has_new_log:
                  if not exit_seen:
                      # Give late log writes a moment, then poll once more so
                      # both the liveness check and the log read are repeated.
                      exit_seen = True
                      await asyncio.sleep(exit_grace)
                      continue
                  error_msg = f"Remote process exited unexpectedly (pid={pid})"
                  try:
                      from app.core.remote_executor import get_remote_stderr
                      stderr_content = await asyncio.to_thread(get_remote_stderr, job_id)
                      if stderr_content:
                          error_msg = stderr_content[-1000:]
                  except Exception as e:
                      logger.warning(f"Could not fetch remote stderr for job {job_id}: {e}")
                  logger.error(f"Remote job {job_id} failed: {error_msg}")
                  await self._fail_remote_job(job_id, pid, error_msg)
                  return

              await asyncio.sleep(poll_interval)

          # Polling budget exhausted.
          error_msg = "Remote training timed out"
          logger.error(f"Remote job {job_id} failed: {error_msg}")
          await self._fail_remote_job(job_id, pid, error_msg)
  # Global singleton instance.
  job_queue: JobQueue = JobQueue(max_concurrent=2)