[feat] batch broadcast requests into a configurable buffer #43

Closed: wants to merge 51 commits
Changes from 10 commits

Commits (51)
86628e1
first take at comms batching when broadcasting the state
blefaudeux Aug 14, 2020
4dfc71c
sorting imports..
blefaudeux Aug 14, 2020
5b00c3b
nit
blefaudeux Aug 14, 2020
df15aa7
new machine means new linting..
blefaudeux Aug 14, 2020
3565391
better unit testing
blefaudeux Aug 15, 2020
f0e6814
hotfix, dimension
blefaudeux Aug 15, 2020
b7f7802
better unit testing still, preemptive bugfix
blefaudeux Aug 17, 2020
8084f98
linting
blefaudeux Aug 17, 2020
510f773
unit test fix, a little prettier, should be gtg
blefaudeux Aug 17, 2020
582134f
annoying, remove coverage check on the type hints
blefaudeux Aug 18, 2020
4ed074b
initial commit, dummy training loop, pure pytorch but not DDP
blefaudeux Aug 20, 2020
a167289
probably slightly broken, but rough DDP benchmark run
blefaudeux Aug 20, 2020
20b981d
adding the torchvision requirement for testing
blefaudeux Aug 20, 2020
8a2377c
brainfart
blefaudeux Aug 20, 2020
41dcf69
reduce the loss, do something slightly distributed
blefaudeux Aug 20, 2020
b212dee
Some cleanup, distributing the training on two GPUs
blefaudeux Aug 20, 2020
b149113
Merge remote-tracking branch 'upstream/master' into oss_benchmark
blefaudeux Aug 20, 2020
b5cacbd
some cleanup + adding a vanilla run, still not good to go
blefaudeux Aug 20, 2020
928791e
less silly defaults, gtg for a start I think
blefaudeux Aug 20, 2020
e6a4756
smaller batch to fit the smaller gpus used in the circleci rigs
blefaudeux Aug 20, 2020
e01a60a
Merge commit 'c2d6f4b68e9c24d05a3eb5da4f60431d9e5c86d8' into oss_batc…
blefaudeux Aug 20, 2020
ab79ddc
better device/buffer allocation
blefaudeux Aug 20, 2020
906d740
Merge commit 'e6a4756c1c2927d35af2148dbfb8d0e1f3bff797' into oss_batc…
blefaudeux Aug 20, 2020
d599f4c
WIP, some type hint cleaning, speed deficit for now
blefaudeux Aug 20, 2020
78fc476
lint + double buffering setting
blefaudeux Aug 20, 2020
4560a0c
fix some lazy programming when running on cpu
blefaudeux Aug 20, 2020
56974ed
tighter OSS input type
blefaudeux Aug 20, 2020
8aa48f2
Merge branch 'master' into oss_batch_broadcast
blefaudeux Sep 2, 2020
24d619d
fixing botched merge
blefaudeux Sep 2, 2020
11811f7
Merge remote-tracking branch 'upstream/master' into oss_batch_broadcast
blefaudeux Sep 3, 2020
2209cce
default the buffer to None, check for device locality
blefaudeux Sep 3, 2020
47da0b5
linting + tweak the broadcast buffer settings
blefaudeux Sep 3, 2020
4d4b8cf
minor tweak to the oss benchmark CLI, smaller param buffer
blefaudeux Sep 3, 2020
5cbe21e
adjust speed for RMSProp
blefaudeux Sep 3, 2020
b3aad66
bugfix
blefaudeux Sep 3, 2020
07d6626
back to 100% code coverage, slightly cleaner unit test
blefaudeux Sep 3, 2020
b09d2a1
WIP
blefaudeux Sep 4, 2020
3739ee0
Merge branch 'master' into oss_batch_broadcast
blefaudeux Sep 8, 2020
5f78ccf
better bucketing, across devices and ranks. credits to oss_ddp. WIP i…
blefaudeux Sep 8, 2020
c711b73
cosmetics
blefaudeux Sep 9, 2020
af9dc13
WIP
blefaudeux Sep 9, 2020
3dd3c27
Merge remote-tracking branch 'upstream/master' into oss_batch_broadcast
blefaudeux Sep 9, 2020
8916c50
merge fixes + tentative perf improvements
blefaudeux Sep 9, 2020
68a67fb
allocate per-device broadcast buffer once and for all, at constructio…
blefaudeux Sep 9, 2020
4cadd58
deduplicate oss_ddp/oss
blefaudeux Sep 10, 2020
07c20a9
merge with upstream master, could still be optimized
blefaudeux Sep 10, 2020
5decde1
Merge remote-tracking branch 'upstream/master' into oss_batch_broadcast
blefaudeux Sep 15, 2020
a8f601c
Merge remote-tracking branch 'upstream/master' into oss_batch_broadcast
blefaudeux Sep 29, 2020
b34bedd
restoring working state, nccl deadlocking unfortunately
blefaudeux Sep 29, 2020
08ce45d
wip
blefaudeux Sep 29, 2020
26308b4
in working order, but unbearably slow
blefaudeux Sep 29, 2020
41 changes: 38 additions & 3 deletions fairscale/optim/oss.py
@@ -11,10 +11,11 @@
import torch.distributed as dist
from torch.optim import SGD, Optimizer

from .utils import broadcast_object, recursive_copy_to_device
from .utils import batch_broadcast, broadcast_object, recursive_copy_to_device

if TYPE_CHECKING: # pragma: no cover
from torch.optim.optimizer import _params_t
from torch.nn import Parameter
else:
_params_t = Any

@@ -43,12 +44,22 @@ class OSS(Optimizer):
optimizer to shard (default: SGD)
group (group):
torch.distributed group (default: group.WORLD)
buffer_size (int, optional): number of elements to buffer before
Contributor:

What does optional mean in this context? The parameter does not look like an optional.

Contributor Author:

I meant to write that people are free to pass it in or not; there's a default provided.

performing reduce (default: 32M). Used to reduce multiple small
params to avoid communication overhead.
"""

optim: Optimizer
in_super_constructor: bool

def __init__(self, params: _params_t, optim: Type[Optimizer] = SGD, group: Any = dist.group.WORLD, **defaults: Any):
def __init__(
self,
params: _params_t,
optim: Type[Optimizer] = SGD,
group: Any = dist.group.WORLD,
buffer_size: int = 2 ** 25,
**defaults: Any
):
# Hold all the model params in the root .param_groups
self.in_super_constructor = True
super().__init__(params, defaults)
@@ -65,6 +76,7 @@ def __init__(self, params: _params_t, optim: Type[Optimizer] = SGD, group: Any =

# Current device is set by the parameters allocated to this rank
self._device = self.partition_parameters()[self.rank][0]["params"][0].device
self._buffer = torch.zeros(buffer_size).to(self._device)

def partition_parameters(self) -> List[List[dict]]:
"""Partitions parameters across distributed ranks.
@@ -97,9 +109,32 @@ def step(self, closure: Optional[Callable[[], float]] = None) -> Optional[float]

# Sync all the states
for rank, param_groups in enumerate(self.partition_parameters()):
# Batch smaller params in a broadcast buffer to speed up the communication
buffered_params: List[Parameter] = []
buffered_elements = 0

for param_group in param_groups:
for param in param_group["params"]:
dist.broadcast(tensor=param, src=rank, group=self.group)
if param.numel() >= self._buffer.numel():
# Big param block, broadcast directly
dist.broadcast(tensor=param, src=rank, group=self.group)
else:
if (buffered_elements + param.numel()) >= self._buffer.numel():
# Batch buffer is full, sync
batch_broadcast(
buffered_params, source_rank=rank, buffer=self._buffer, process_group=self.group
)
buffered_params.clear()
buffered_elements = 0

# Keep async and batch sync later
buffered_params.append(param)
buffered_elements += param.numel()

# Sync whatever is left in the batch buffer before moving to the next rank
if buffered_elements > 0:
batch_broadcast(buffered_params, source_rank=rank, buffer=self._buffer, process_group=self.group)

return loss

def local_state_dict(self) -> dict:
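
To make the new buffer_size argument concrete (and the review question above about it being optional), here is a minimal usage sketch. It assumes torch.distributed has already been initialized by the launcher, that OSS is exported from fairscale.optim, and uses a toy Linear model; only the buffer_size keyword comes from this diff, the rest is illustrative.

import torch
import torch.distributed as dist
from fairscale.optim import OSS

# Assumes dist.init_process_group(...) has already been called on this rank.
model = torch.nn.Linear(16, 16)

# Default: a flat broadcast buffer of 2 ** 25 elements (the ~32M from the docstring)
# is allocated once on this rank's device.
opt_default = OSS(model.parameters(), lr=0.1)

# Tiny buffer, as in the new unit test: every param exceeds it,
# so each one is broadcast directly instead of being batched.
opt_small = OSS(model.parameters(), lr=0.1, buffer_size=8)
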
32 changes: 31 additions & 1 deletion fairscale/optim/utils.py
@@ -4,12 +4,18 @@
# LICENSE file in the root directory of this source tree.

import io
from typing import Any, Dict
from typing import TYPE_CHECKING, Any, Dict, List

import torch
from torch._six import container_abcs
import torch.distributed as dist

if TYPE_CHECKING: # pragma: no cover
from torch import Tensor
from torch.nn import Parameter
else:
Tensor = Any
Parameter = Any

# Credits: classy_vision/generic/distributed_util.py
def recursive_copy_to_device(value: Any, non_blocking: bool, device: torch.device) -> Any:
@@ -68,3 +74,27 @@ def broadcast_object(
buffer = io.BytesIO(data_recv_tensor.cpu().numpy())
obj = torch.load(buffer, map_location=dist_device)
return obj


def batch_broadcast(
buffered_params: List[Parameter], source_rank: int, buffer: Tensor, process_group: Any = None
) -> None:
""" Helper to broadcast a list of params batched into a bigger buffer.
NOTE: This skips the grads on purpose, only broadcasts the tensor parameters.
NOTE: This also asserts that the parameters will fit in the buffer """

offset = 0
for p in buffered_params:
sz = p.numel()
buffer[offset : offset + p.numel()].copy_(p.data.view(-1)) # type: ignore
offset += sz
assert offset < buffer.numel()

dist.broadcast(tensor=buffer, src=source_rank, group=process_group)

# copy the broadcasted params back into their original place
offset = 0
for p in buffered_params:
sz = p.numel()
p.data.copy_(buffer[offset : offset + sz].view_as(p)) # type: ignore
offset += sz
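
For reference, a rough sketch of exercising the new batch_broadcast helper on its own; the tensor shapes and buffer length below are made up, and it assumes torch.distributed is already initialized (for instance inside an mp.spawn worker like the tests that follow), with the params and buffer living on that rank's device.

import torch
import torch.distributed as dist
from torch import nn
from fairscale.optim.utils import batch_broadcast

# Assumes dist.init_process_group(...) has already run on every rank (CPU/gloo shown here).
params = [nn.Parameter(torch.randn(4, 4)), nn.Parameter(torch.randn(8))]  # 16 + 8 = 24 elements
buffer = torch.zeros(32)  # flat staging buffer, large enough for the 24 buffered elements

# Flattens every param into `buffer`, issues a single dist.broadcast from rank 0,
# then copies the received values back into each param in place (grads are untouched).
batch_broadcast(params, source_rank=0, buffer=buffer, process_group=dist.group.WORLD)

Packing many small tensors into one collective call trades a couple of extra copies for far fewer latency-bound broadcasts, which is the communication overhead the buffer_size docstring refers to.
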
63 changes: 63 additions & 0 deletions tests/optim/test_oss.py
@@ -7,6 +7,7 @@
# pylint: disable=missing-class-docstring
# pylint: disable=missing-function-docstring

import math
import os

import pytest
@@ -140,6 +141,68 @@ def test_step():
mp.spawn(run_test_step, args=(world_size,), nprocs=world_size, join=True)


def run_test_batch_broadcast(rank, world_size):
dist_init(rank, world_size)
width_multiplier = 3
batch_size = 10

x = torch.ones([batch_size, world_size], device=rank)
target = torch.zeros([batch_size, width_multiplier * world_size], device=rank)
error = math.factorial(width_multiplier * world_size - 1)

def get_model():
layers = [torch.nn.Linear(i, i + 1) for i in range(world_size, width_multiplier * world_size)]
for l in layers:
l.weight.data.fill_(1.0)
l.bias.data.fill_(0.0)

m = torch.nn.Sequential(*layers)
m.to(rank)
return m

# Set a very small buffer size so that every param exceeds it and is broadcast directly (no batching)
m_small = get_model()
o = optim.OSS(m_small.parameters(), lr=0.1, buffer_size=8)
loss_fn = torch.nn.L1Loss().to(device=rank)

def closure():
o.zero_grad()
output = m_small(x)
loss = loss_fn(output, target)
loss.backward()
return loss

loss = o.step(closure=closure)
assert round(loss.item()) == error, f"{loss} vs. expected: {error}"

loss_update = o.step(closure=closure)
assert loss_update.item() < loss.item(), f"{loss.item()} vs {loss_update.item()} loss should decrease"

# Set a very big buffer size to force all the params to be packed
m_large = get_model()
o = optim.OSS(m_large.parameters(), lr=0.1, buffer_size=2 ** 26)
loss_fn = torch.nn.L1Loss().to(device=rank)

def closure():
o.zero_grad()
output = m_large(x)
loss = loss_fn(output, target)
loss.backward()
return loss

loss = o.step(closure=closure)
assert round(loss.item()) == error, f"{loss} vs. expected: {error}"

loss_update = o.step(closure=closure)
assert loss_update.item() < loss.item(), f"{loss.item()} vs {loss_update.item()} loss should decrease"


@skip_if_no_cuda
def test_batch_broadcast():
world_size = min(2, torch.cuda.device_count())
mp.spawn(run_test_batch_broadcast, args=(world_size,), nprocs=world_size, join=True)


def run_test_step_with_closure(rank, world_size, optimizer=None):
dist_init(rank, world_size)
