# ephemeral_ports.py
"""
Helpers to reconcile gpustack's configured port ranges (service, Ray) with the
Linux kernel's ephemeral port range.

When an outbound TCP connection is made, the kernel picks a local source port
from `net.ipv4.ip_local_port_range` (default 32768-60999). gpustack's default
`ray_port_range` (41000-41999) and `service_port_range` (40000-40063) fall
inside this window, so a worker-side outbound connection (e.g. gpustack-worker
talking to the server on :80) can transiently squat on a port that Ray or an
inference server later tries to bind(), causing "Address already in use".

The fix is to append the gpustack ranges to `ip_local_reserved_ports`. This
module detects the conflict and, when running with enough privilege, applies
the reservation automatically. Otherwise it logs the exact sysctl command the
operator must run.
"""
from __future__ import annotations
import logging
from pathlib import Path
from typing import List, Optional, Tuple
from gpustack import envs
from gpustack.utils import platform

# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# procfs file that is read to obtain the kernel's local (ephemeral) port range.
_LOCAL_PORT_RANGE_PATH = Path("/proc/sys/net/ipv4/ip_local_port_range")
# procfs file holding the reserved port list; this module both reads it and
# writes the updated list back to it.
_RESERVED_PORTS_PATH = Path("/proc/sys/net/ipv4/ip_local_reserved_ports")

# Inclusive (start, end) port range.
Range = Tuple[int, int]
  25. def _parse_ranges(text: str) -> List[Range]:
  26. """
  27. Parse a sysctl port list (comma-separated ports and N1-N2 ranges) into
  28. a list of inclusive (start, end) tuples. Returns [] on empty input.
  29. """
  30. ranges: List[Range] = []
  31. for token in text.replace(",", " ").split():
  32. if not token:
  33. continue
  34. if "-" in token:
  35. a, b = token.split("-", 1)
  36. ranges.append((int(a), int(b)))
  37. else:
  38. p = int(token)
  39. ranges.append((p, p))
  40. return ranges
  41. def _merge(ranges: List[Range]) -> List[Range]:
  42. """
  43. Merge overlapping/adjacent ranges. Result is sorted and disjoint.
  44. """
  45. if not ranges:
  46. return []
  47. ordered = sorted(ranges)
  48. merged: List[Range] = [ordered[0]]
  49. for start, end in ordered[1:]:
  50. last_start, last_end = merged[-1]
  51. if start <= last_end + 1:
  52. merged[-1] = (last_start, max(last_end, end))
  53. else:
  54. merged.append((start, end))
  55. return merged
  56. def _format_ranges(ranges: List[Range]) -> str:
  57. parts: List[str] = []
  58. for start, end in ranges:
  59. parts.append(str(start) if start == end else f"{start}-{end}")
  60. return ",".join(parts)
  61. def _covered_by(target: Range, ranges: List[Range]) -> bool:
  62. start, end = target
  63. for rs, re_ in ranges:
  64. if rs <= start and re_ >= end:
  65. return True
  66. return False
  67. def _overlaps(a: Range, b: Range) -> bool:
  68. return a[0] <= b[1] and b[0] <= a[1]
  69. def _read_port_range(path: Path) -> Optional[Range]:
  70. try:
  71. text = path.read_text().strip()
  72. except OSError as e:
  73. logger.debug("Cannot read %s: %s", path, e)
  74. return None
  75. parts = text.split()
  76. if len(parts) != 2:
  77. logger.debug("Unexpected content in %s: %r", path, text)
  78. return None
  79. try:
  80. return int(parts[0]), int(parts[1])
  81. except ValueError:
  82. logger.debug("Non-numeric content in %s: %r", path, text)
  83. return None
  84. def _read_reserved_ports(path: Path) -> Optional[List[Range]]:
  85. """
  86. Returns the currently reserved ranges, [] if the file is missing/empty,
  87. or None if the content is unparseable — in which case callers should
  88. abort rather than risk overwriting user-managed reservations.
  89. """
  90. try:
  91. text = path.read_text().strip()
  92. except OSError as e:
  93. logger.debug("Cannot read %s: %s", path, e)
  94. return []
  95. try:
  96. return _merge(_parse_ranges(text))
  97. except ValueError:
  98. logger.warning(
  99. "Cannot parse %s: %r. Skipping auto-reservation to avoid "
  100. "overwriting existing configuration.",
  101. path,
  102. text,
  103. )
  104. return None
  105. def ensure_reserved_against_ephemeral(
  106. port_ranges: List[Tuple[str, Range]],
  107. ) -> None:
  108. """
  109. Ensure each given port range is reserved against the kernel's ephemeral
  110. range on Linux. No-op on other platforms or when /proc sysctls are
  111. unreadable (e.g., unprivileged container, unusual kernels).
  112. port_ranges: list of (human_name, (start, end)) tuples.
  113. """
  114. if envs.SKIP_RESERVE_EPHEMERAL_PORTS:
  115. logger.info(
  116. "Skipping ephemeral port reservation because "
  117. "GPUSTACK_SKIP_RESERVE_EPHEMERAL_PORTS is set."
  118. )
  119. return
  120. if platform.system() != "linux":
  121. return
  122. if not _LOCAL_PORT_RANGE_PATH.exists():
  123. return
  124. ephemeral = _read_port_range(_LOCAL_PORT_RANGE_PATH)
  125. if ephemeral is None:
  126. return
  127. reserved = _read_reserved_ports(_RESERVED_PORTS_PATH)
  128. if reserved is None:
  129. return
  130. conflicts: List[Tuple[str, Range]] = []
  131. for name, rng in port_ranges:
  132. if not _overlaps(rng, ephemeral):
  133. continue
  134. if _covered_by(rng, reserved):
  135. continue
  136. conflicts.append((name, rng))
  137. if not conflicts:
  138. return
  139. desired = _merge(reserved + [rng for _, rng in conflicts])
  140. payload = _format_ranges(desired)
  141. conflict_desc = ", ".join(
  142. f"{name}={start}-{end}" for name, (start, end) in conflicts
  143. )
  144. try:
  145. _RESERVED_PORTS_PATH.write_text(payload)
  146. except OSError as e:
  147. logger.warning(
  148. "gpustack port ranges (%s) overlap the kernel ephemeral port "
  149. "range %d-%d and are not reserved. Ray or inference servers may "
  150. "fail to bind when the kernel transiently assigns one of these "
  151. "ports as an outbound ephemeral port. Failed to auto-reserve "
  152. "(%s). Run on each worker host:\n"
  153. " echo 'net.ipv4.ip_local_reserved_ports = %s' | "
  154. "sudo tee -a /etc/sysctl.conf && sudo sysctl -p",
  155. conflict_desc,
  156. ephemeral[0],
  157. ephemeral[1],
  158. e,
  159. payload,
  160. )
  161. return
  162. logger.info(
  163. "Reserved gpustack port ranges (%s) against kernel ephemeral range "
  164. "%d-%d via %s (now: %s).",
  165. conflict_desc,
  166. ephemeral[0],
  167. ephemeral[1],
  168. _RESERVED_PORTS_PATH,
  169. payload,
  170. )