huggingface · Mr-Neutr0n · Feb 7, 2026 · Jun 1, 2026
diff --git a/src/diffusers/models/_modeling_parallel.py b/src/diffusers/models/_modeling_parallel.py
@@ -22,6 +22,7 @@
 import torch.distributed as dist
 
 from ..utils import get_logger
+from ..utils.torch_utils import get_device
 
 
 if TYPE_CHECKING:
@@ -290,7 +291,10 @@ def gather_size_by_comm(size: int, group: dist.ProcessGroup) -> List[int]:
     # HACK: Use Gloo backend for all_gather to avoid H2D and D2H overhead
     comm_backends = str(dist.get_backend(group=group))
     # NOTE: e.g., dist.init_process_group(backend="cpu:gloo,cuda:nccl")
-    gather_device = "cpu" if "cpu" in comm_backends else torch.accelerator.current_accelerator()
+    # get_device() resolves the active accelerator (cuda/npu/xpu/mps/mlu/cpu) and
+    # handles torch-version compatibility internally, so no explicit
+    # hasattr(torch, "accelerator") guard is needed here.
+    gather_device = "cpu" if "cpu" in comm_backends else get_device()
     gathered_sizes = [torch.empty((1,), device=gather_device, dtype=torch.int64) for _ in range(world_size)]
     dist.all_gather(
         gathered_sizes,