): + logger.warning( + "vLLM only supports Linux platform (including WSL) and MacOS." + "Building on %s, " diff --git a/misc/py-vllm/files/patch-vllm_distributed_parallel__state.py b/misc/py-vllm/files/patch-vllm_distributed_parallel__state.py new file mode 100644 index 000000000000..cebb4198f919 --- /dev/null +++ b/misc/py-vllm/files/patch-vllm_distributed_parallel__state.py @@ -0,0 +1,35 @@ +--- vllm/distributed/parallel_state.py.orig 2026-04-07 18:19:15 UTC ++++ vllm/distributed/parallel_state.py +@@ -24,6 +24,7 @@ import contextlib + """ + + import contextlib ++import sys + import gc + import pickle + import weakref +@@ -341,8 +342,13 @@ class GroupCoordinator: + ) + # a group with `gloo` backend, to allow direct coordination between + # processes through the CPU. ++ # On FreeBSD, gloo TCP transport is unavailable; use fake backend. ++ _cpu_backend = "gloo" ++ if sys.platform.startswith("freebsd"): ++ import importlib; importlib.import_module("torch.testing._internal.distributed.fake_pg") ++ _cpu_backend = "fake" + with suppress_stdout(): +- cpu_group = torch.distributed.new_group(ranks, backend="gloo") ++ cpu_group = torch.distributed.new_group(ranks, backend=_cpu_backend) + if self.rank in ranks: + self.ranks = ranks + self.world_size = len(ranks) +@@ -1419,6 +1425,9 @@ def init_distributed_environment( + ) + backend = "gloo" + # this backend is used for WORLD ++ # On FreeBSD, register the 'fake' backend before use. 
++ if sys.platform.startswith("freebsd") and backend == "fake": ++ import importlib; importlib.import_module("torch.testing._internal.distributed.fake_pg") + torch.distributed.init_process_group( + backend=backend, + init_method=distributed_init_method, diff --git a/misc/py-vllm/files/patch-vllm_platforms_____init____.py b/misc/py-vllm/files/patch-vllm_platforms_____init____.py new file mode 100644 index 000000000000..ab6f660d7de3 --- /dev/null +++ b/misc/py-vllm/files/patch-vllm_platforms_____init____.py @@ -0,0 +1,29 @@ +--- vllm/platforms/__init__.py.orig 2026-04-07 17:26:12 UTC ++++ vllm/platforms/__init__.py +@@ -58,6 +58,11 @@ def cuda_platform_plugin() -> str | None: + + + def cuda_platform_plugin() -> str | None: ++ import sys ++ if sys.platform.startswith("freebsd"): ++ # CUDA extensions are not built on FreeBSD; use CPU platform instead. ++ logger.debug("CUDA platform disabled on FreeBSD.") ++ return None + is_cuda = False + logger.debug("Checking if CUDA platform is available.") + try: +@@ -172,10 +177,12 @@ def cpu_platform_plugin() -> str | None: + if not is_cpu: + import sys + +- is_cpu = sys.platform.startswith("darwin") ++ is_cpu = sys.platform.startswith("darwin") or sys.platform.startswith( ++ "freebsd" ++ ) + if is_cpu: + logger.debug( +- "Confirmed CPU platform is available because the machine is MacOS." ++ "Confirmed CPU platform is available because the machine is MacOS or FreeBSD." 
+ ) + + except Exception as e: diff --git a/misc/py-vllm/files/patch-vllm_platforms_cpu.py b/misc/py-vllm/files/patch-vllm_platforms_cpu.py new file mode 100644 index 000000000000..ae1db6392125 --- /dev/null +++ b/misc/py-vllm/files/patch-vllm_platforms_cpu.py @@ -0,0 +1,42 @@ +--- vllm/platforms/cpu.py.orig 2026-04-03 01:57:10 UTC ++++ vllm/platforms/cpu.py +@@ -74,7 +74,8 @@ class CpuPlatform(Platform): + device_name: str = "cpu" + device_type: str = "cpu" + dispatch_key: str = "CPU" +- dist_backend: str = "gloo" ++ # FreeBSD lacks gloo TCP transport (epoll-based); use fake backend. ++ dist_backend: str = "fake" if sys.platform.startswith("freebsd") else "gloo" + device_control_env_var = "CPU_VISIBLE_MEMORY_NODES" + + @property +@@ -378,7 +379,28 @@ class CpuPlatform(Platform): + + @classmethod + def get_allowed_cpu_core_node_list(cls) -> tuple[list[int], list[LogicalCPUInfo]]: +- assert platform.system() == "Linux" ++ assert platform.system() in ("Linux", "FreeBSD") ++ ++ if platform.system() == "FreeBSD": ++ # FreeBSD lacks lscpu -J; treat all CPUs as a single NUMA node. 
++ allowed_cpu_id_set = ( ++ os.sched_getaffinity(0) ++ if hasattr(os, "sched_getaffinity") ++ else set(range(os.cpu_count() or 1)) ++ ) ++ logical_cpu_list = [ ++ LogicalCPUInfo(id=cpu_id, physical_core=cpu_id, numa_node=0) ++ for cpu_id in sorted(allowed_cpu_id_set) ++ ] ++ allowed_numa_nodes_list = [0] ++ env_key = CpuPlatform.device_control_env_var ++ if env_key in os.environ and os.environ[env_key] != "": ++ visible_nodes = [int(s) for s in os.environ[env_key].split(",")] ++ allowed_numa_nodes_list = [ ++ x for x in sorted(list(set(visible_nodes))) ++ if x in allowed_numa_nodes_list ++ ] ++ return allowed_numa_nodes_list, logical_cpu_list + + # Init LogicalCPUInfo from lscpu + lscpu_output = subprocess.check_output( diff --git a/misc/py-vllm/files/patch-vllm_v1_worker_cpu__worker.py b/misc/py-vllm/files/patch-vllm_v1_worker_cpu__worker.py new file mode 100644 index 000000000000..e762ac4eafb9 --- /dev/null +++ b/misc/py-vllm/files/patch-vllm_v1_worker_cpu__worker.py @@ -0,0 +1,12 @@ +--- vllm/v1/worker/cpu_worker.py.orig 2026-04-07 17:26:12 UTC ++++ vllm/v1/worker/cpu_worker.py +@@ -91,6 +91,9 @@ class CPUWorker(Worker): + self.local_omp_cpuid = self._get_autobind_cpu_ids(lambda cpus: cpus) + else: + self.local_omp_cpuid = "nobind" ++ elif omp_cpuids == "auto": ++ # Non-Linux OS: NUMA-based auto-binding not supported, fall back to nobind ++ self.local_omp_cpuid = "nobind" + elif omp_cpuids == "nobind": + self.local_omp_cpuid = "nobind" + else: diff --git a/misc/py-vllm/pkg-descr b/misc/py-vllm/pkg-descr new file mode 100644 index 000000000000..1d4993624aea --- /dev/null +++ b/misc/py-vllm/pkg-descr @@ -0,0 +1,13 @@ +vLLM is a fast and easy-to-use library for LLM inference and serving. 
+It provides high-throughput and memory-efficient inference for large language +models (LLMs) using state-of-the-art serving technologies, including: + +- PagedAttention for efficient KV cache memory management +- Continuous batching of incoming requests +- Optimized CUDA kernels (on supported platforms) +- Hugging Face model compatibility +- Various decoding algorithms, including parallel sampling and beam search +- OpenAI-compatible API server + +On FreeBSD, vLLM runs in CPU/empty device mode (VLLM_TARGET_DEVICE=empty), +providing pure Python inference without GPU acceleration.