Add FullyShardedDataParallel (FSDP) #413

Merged: 51 commits, Feb 23, 2021

Commits

a7be0d2
Add fairscale.utils.testing.DeviceAndTypeCheckModule
myleott Jan 28, 2021
e2ad716
Add fairscale.utils.containers
myleott Jan 28, 2021
4d6a5c9
Add ShardParamsDataParallel
myleott Jan 26, 2021
dd57e30
[test]: skip autocast-needed tests on torch < 1.6 (#34)
min-xu-ai Jan 29, 2021
80678fc
[mypy]: fairscale/utils/containers.py (#33)
min-xu-ai Jan 29, 2021
5fb36d1
[mypy]: fixed fairscale/utils/testing.py (#32)
min-xu-ai Jan 29, 2021
774e130
More docs (#35)
myleott Jan 29, 2021
b35a28a
[mypy]: fixed all the mypy errors (#37)
min-xu-ai Feb 1, 2021
92c550b
Sharded DDP: test cpu_offload arg (#40)
sshleifer Feb 1, 2021
5bb212f
Misc comments from @anj-s (#43)
myleott Feb 1, 2021
cbd243e
Only sync fp32_to_fp16 stream for the top-most (root) ShardParams wra…
myleott Feb 2, 2021
8db0cf6
Fix streams test (#45)
myleott Feb 2, 2021
bc7e337
move_grads_to_cpu defaults to same value as cpu_offload (#44)
sshleifer Feb 2, 2021
a1b3924
formatting change (#46)
min-xu-ai Feb 2, 2021
e10df73
Test that backward hooks are registered (#49)
sshleifer Feb 3, 2021
e139857
[test] add test for apply_to_tensors (#50)
min-xu-ai Feb 3, 2021
6f153b0
Test save/load state_dict V2 (#51)
sshleifer Feb 4, 2021
4114772
Replace x.view(-1) with torch.flatten(x) (#59)
myleott Feb 4, 2021
36b2d39
Add more comments + docstrings (#58)
myleott Feb 4, 2021
bc5190b
Rearrange dtype and device change in post-backward hook (#61)
myleott Feb 4, 2021
8a5f81c
Do reduce-scatter in a separate CUDA stream (#62)
myleott Feb 4, 2021
72c1f63
tests use spawn_for_all_world_sizes (#63)
sshleifer Feb 5, 2021
f481877
Fix state_dict bugs (#60)
sshleifer Feb 6, 2021
515411b
update comments to reflect where we are in stack (#69)
sshleifer Feb 7, 2021
ec4e75e
[CI] use parameterized.expand to make each test faster (#68)
sshleifer Feb 7, 2021
b1460d3
Fix delayed reduce_scatter test (#74)
myleott Feb 7, 2021
014ad05
add unit test pack/unpack kwargs (#65)
min-xu-ai Feb 7, 2021
0ca378b
Refactor param init and streams logic (#73)
myleott Feb 7, 2021
bebe7fd
Add test for NoGrad mode
myleott Feb 8, 2021
74b0223
Add all_gather stream and disable reshard_after_forward on root insta…
myleott Feb 8, 2021
6797964
Leave buffers on self.compute_device (#67)
sshleifer Feb 9, 2021
0937594
Pad parameters inside: right before gather, scatter (#76)
sshleifer Feb 9, 2021
93670cb
Add no_sync() context manager (#77)
myleott Feb 10, 2021
1d0bf73
rename
sshleifer Feb 10, 2021
5fc1f12
Slightly faster execution when world_size == 1 (#81)
myleott Feb 11, 2021
3681242
merge new base (which is public/master) (#82)
min-xu-ai Feb 11, 2021
dfada29
Merge branch 'shard_params_ddp_base' into shard_params_ddp
myleott Feb 11, 2021
9fb974b
Merge branch 'shard_params_ddp_base' into shard_params_ddp
myleott Feb 11, 2021
7bd82d1
two small changes (#83)
min-xu-ai Feb 12, 2021
d8f3349
fixing uneven shard support and add uneven shard unit test (#80)
min-xu-ai Feb 14, 2021
366c38e
rename test file (#86)
min-xu-ai Feb 15, 2021
87d28c9
adding TrainingState enum for asserting purpose (#87)
min-xu-ai Feb 19, 2021
72b5967
Misc (#88)
myleott Feb 19, 2021
a0fe832
Bugfix for backward stream (#91)
myleott Feb 20, 2021
2abf21f
Small fix to activation checkpointing + FSDP (#92)
myleott Feb 22, 2021
977f459
Bugfix (+ add test) for no_sync before first forward (#93)
myleott Feb 22, 2021
43d1f73
FSDP.clip_grad_norm (#89)
sshleifer Feb 22, 2021
77dc364
Add ReduceScatterBucketer (#94)
myleott Feb 22, 2021
bf30e59
Merge branch 'oss-master' into shard_params_ddp
myleott Feb 22, 2021
ec8bf9f
Fix merge conflict
myleott Feb 22, 2021
3f2b7f1
Update docstring slightly
myleott Feb 22, 2021
1 change: 1 addition & 0 deletions fairscale/nn/data_parallel/__init__.py
@@ -3,4 +3,5 @@
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.

from .fully_sharded_data_parallel import FullyShardedDataParallel
from .sharded_ddp import ShardedDataParallel
947 changes: 947 additions & 0 deletions fairscale/nn/data_parallel/fully_sharded_data_parallel.py

Large diffs are not rendered by default.
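
Since the main fully_sharded_data_parallel.py diff is not rendered here, the following is a minimal usage sketch for orientation only. It assumes the import added to __init__.py above and constructor arguments that the commit messages mention (reshard_after_forward, cpu_offload); exact argument names and defaults should be checked against the implementation.

# Minimal sketch; argument names are assumptions based on the commit messages
# above, not on the (unrendered) implementation diff.
import torch
import torch.nn as nn
from fairscale.nn.data_parallel import FullyShardedDataParallel as FSDP

# torch.distributed must already be initialized (one process per GPU).
model = nn.Sequential(nn.Linear(1024, 1024), nn.ReLU(), nn.Linear(1024, 10)).cuda()

# Shard parameters across data-parallel workers; gradients are reduce-scattered
# in backward so each rank only keeps its own shard.
fsdp_model = FSDP(
    model,
    reshard_after_forward=True,  # free full params after forward, re-gather in backward
    cpu_offload=False,           # optionally keep the sharded params on CPU
)

optimizer = torch.optim.SGD(fsdp_model.parameters(), lr=0.01)

inputs = torch.randn(8, 1024, device="cuda")
loss = fsdp_model(inputs).sum()
loss.backward()
optimizer.step()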

25 changes: 22 additions & 3 deletions fairscale/nn/misc/checkpoint_activations.py
@@ -3,8 +3,9 @@
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.

from contextlib import contextmanager
import functools
from typing import Any, Dict, Optional, Tuple
from typing import Any, Dict, Generator, Optional, Tuple

import torch
from torch import Tensor
@@ -73,6 +74,23 @@ def set_rng_state(state: Dict[str, Any]) -> None:
torch.cuda.set_rng_state(state["cuda_rng_state"])


def is_autocast_enabled() -> bool:
"""Similar to torch.is_autocast_enabled, but compatible with torch 1.5.1"""
if hasattr(torch, "is_autocast_enabled"):
return torch.is_autocast_enabled()
return False


@contextmanager
def autocast(enabled: bool) -> Generator:
"""Similar to torch.cuda.amp.autocast, but compatible with torch 1.5.1"""
if enabled:
with torch.cuda.amp.autocast(enabled):
yield
else:
yield


class CheckpointFunction(torch.autograd.Function):
"""Similar to the torch version, but support non-Tensor outputs.

@@ -96,13 +114,13 @@ def forward( # type: ignore
ctx.run_function = run_function
ctx.kwarg_keys = kwarg_keys
ctx.fwd_rng_state = get_rng_state()
ctx.had_autocast_in_fwd = is_autocast_enabled()

tensor_inputs, packed_non_tensor_inputs = split_non_tensors(args)
if parent_ctx_dict["offload"]:
ctx.fwd_device = tuple(x.device for x in tensor_inputs)
ctx.grad_requirements = tuple(x.requires_grad for x in tensor_inputs)
tensor_inputs = tuple(x.cpu() for x in tensor_inputs)

else:
ctx.fwd_device, ctx.grad_requirements = None, None

@@ -142,10 +160,11 @@ def backward(ctx: Any, *args: Any) -> Tuple[Optional[Tensor], ...]:
# Set the states to what it used to be before the forward pass.
set_rng_state(ctx.fwd_rng_state)

with torch.enable_grad():
with torch.enable_grad(), autocast(ctx.had_autocast_in_fwd):
unpacked_args, unpacked_kwargs = unpack_kwargs(ctx.kwarg_keys, inputs)
outputs = ctx.run_function(*unpacked_args, **unpacked_kwargs)
tensor_outputs, _ = split_non_tensors(outputs)

# Set the states back to what it was at the start of this function.
set_rng_state(bwd_rng_state)

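
The had_autocast_in_fwd bookkeeping above matters when a checkpointed module runs under torch.cuda.amp.autocast: the recomputation in backward must re-enter autocast so the recomputed activations match the dtypes of the original forward. A rough sketch of the scenario being fixed, assuming checkpoint_wrapper is exported from fairscale.nn.misc and a CUDA device is available:

# Illustrative only; checkpoint_wrapper's import path is an assumption.
import torch
import torch.nn as nn
from fairscale.nn.misc import checkpoint_wrapper

block = checkpoint_wrapper(nn.Linear(16, 16)).cuda()
x = torch.randn(4, 16, device="cuda", requires_grad=True)

with torch.cuda.amp.autocast():
    y = block(x)       # forward records had_autocast_in_fwd = True
loss = y.float().sum()
loss.backward()        # recomputation re-enters autocast, so dtypes match the forward pass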
17 changes: 11 additions & 6 deletions fairscale/nn/misc/flatten_params_wrapper.py
@@ -2,12 +2,15 @@
# Licensed under the MIT License.

from contextlib import contextmanager
from typing import Any, Dict, Generator, List, Optional, Tuple
from typing import TYPE_CHECKING, Any, Dict, Generator, List, NamedTuple, Optional, Tuple, Union

import torch
from torch import Tensor
import torch.nn as nn

if TYPE_CHECKING:
from collections import OrderedDict # noqa: F401


class FlattenParamsWrapper(nn.Module):
"""
@@ -127,21 +130,23 @@ def __getattr__(self, name: str) -> Any:
except AttributeError:
return getattr(self.module, name) # fallback to wrapped module

def state_dict(self, prefix: str = "", keep_vars: bool = False) -> "OrderedDict[str, Tensor]": # type: ignore
def state_dict(self, *args: Any, **kwargs: Any) -> "OrderedDict[str, Tensor]": # type: ignore
"""Return an unflattened state_dict."""
with self.unflatten_params():
return self.module.state_dict(prefix=prefix, keep_vars=keep_vars)
return self.module.state_dict(*args, **kwargs)

def flat_state_dict(self, *args: Any, **kwargs: Any) -> Dict[str, Any]:
"""Return the flattened state_dict."""
return super().state_dict(*args, **kwargs)

def load_state_dict(self, state_dict: Dict[str, Any], *args: Any, **kwargs: Any) -> None:
def load_state_dict(
self, state_dict: Union[Dict[str, Tensor], "OrderedDict[str, Tensor]"], strict: bool = True
) -> NamedTuple:
if "flat_param" in state_dict:
super().load_state_dict(state_dict, strict=True)
return super().load_state_dict(state_dict, strict=strict)
else:
with self.unflatten_params():
return self.module.load_state_dict(state_dict, *args, **kwargs)
return self.module.load_state_dict(state_dict, strict)

def forward(self, *inputs: Any, **kwinputs: Any) -> Any:
self._unflatten_params_as_views()
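
With the changes above, FlattenParamsWrapper exposes two views of its parameters: state_dict() returns the original (unflattened) keys, flat_state_dict() returns the flattened form, and load_state_dict() dispatches on whether "flat_param" is present. A small sketch of the intended round-trips, assuming FlattenParamsWrapper is exported from fairscale.nn.misc (the toy module is hypothetical):

import torch.nn as nn
from fairscale.nn.misc import FlattenParamsWrapper

wrapped = FlattenParamsWrapper(nn.Sequential(nn.Linear(4, 4), nn.Linear(4, 4)))

sd = wrapped.state_dict()            # unflattened keys, e.g. "0.weight", "0.bias", ...
flat_sd = wrapped.flat_state_dict()  # contains a single "flat_param" tensor instead

wrapped.load_state_dict(sd)          # no "flat_param" key -> loads through the unflattened view
wrapped.load_state_dict(flat_sd)     # "flat_param" present -> loads the flat view directly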
10 changes: 3 additions & 7 deletions fairscale/optim/oss.py
@@ -15,7 +15,7 @@
from torch.nn import Parameter
from torch.optim import SGD, Optimizer

from .utils import broadcast_object, recursive_copy_to_device
from .utils import broadcast_object, calc_grad_norm, recursive_copy_to_device

__all__ = ["OSS"]

@@ -284,18 +284,14 @@ def clip_grad_norm(
# https://github.com/NVIDIA/Megatron-LM/blob/19301985dd31c8b612095cbad15bd903e8ddd497/megatron/mpu/layers.py#L54
local_params = filter_params_fn(self.local_params) if filter_params_fn is not None else self.local_params

local_norm = calc_grad_norm(local_params, norm_type).to(self._default_device)
# Compute the norm on this grad set,
# then sync all the norms from all ranks
if norm_type == inf:
total_norm = max(p.grad.detach().abs().max().to(self._default_device) for p in local_params)
total_norm = local_norm
# all reduce over data parallel and model parallel workers
dist.all_reduce(total_norm, op=torch.distributed.ReduceOp.MAX, group=dist.group.WORLD)
else:
local_norm = torch.norm(
input=torch.stack([torch.norm(input=p.grad.detach(), p=norm_type, dtype=torch.float32).to(self._default_device) for p in local_params]), # type: ignore
p=norm_type,
)

# local norm result can be accumulated with the remote ones if put to the right power
# n_i = sum_rank(a^p)^1/p
# -> n_total = all_reduce(n_i^p)^(1/p) = sum_i(n_i^p)^1/p = sum_i(sum_rank(a^p))^1/p
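
The comment in clip_grad_norm relies on the identity that per-rank p-norms can be combined by raising each to the p-th power, summing across ranks (the all_reduce), and taking the p-th root. A quick single-process numerical check of that identity, with plain tensors standing in for per-rank gradient shards:

import torch

p = 2.0
shards = [torch.randn(10) for _ in range(4)]  # stand-ins for each rank's local grads

# per-"rank" norms, then combined the way the all_reduce path does
local_norms = torch.stack([torch.norm(s, p) for s in shards])
combined = torch.sum(local_norms ** p) ** (1.0 / p)

# norm computed over all elements at once
reference = torch.norm(torch.cat(shards), p)

assert torch.allclose(combined, reference)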
22 changes: 21 additions & 1 deletion fairscale/optim/utils.py
@@ -5,7 +5,8 @@

import collections
import io
from typing import Any, Callable, Dict, Optional
from math import inf
from typing import Any, Callable, Dict, List, Optional

import torch
import torch.distributed as dist
@@ -102,3 +103,22 @@ def reset(self) -> None:
def full(self) -> bool:
""" is the bucket full ? """
return self.max_params_checked_in == self.params_checked_in


def calc_grad_norm(parameters: List[torch.nn.Parameter], p: float) -> torch.Tensor:
r"""Calculate gradient norm of an iterable of parameters.
Returns:
Total norm of the parameters (viewed as a single vector).
"""
if isinstance(parameters, torch.Tensor):
parameters = [parameters]
parameters = list(filter(lambda par: par.grad is not None, parameters))

if len(parameters) == 0:
return torch.tensor(0.0)
p = float(p)
if p == inf:
local_norm = max(par.grad.detach().abs().max() for par in parameters) # type: ignore
else:
local_norm = torch.norm(torch.stack([torch.norm(par.grad.detach(), p) for par in parameters]), p) # type: ignore
return local_norm
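
A minimal usage sketch for the new calc_grad_norm helper (toy model; the import path follows the file above):

import torch
import torch.nn as nn
from fairscale.optim.utils import calc_grad_norm

model = nn.Linear(8, 8)
model(torch.randn(2, 8)).sum().backward()

l2_norm = calc_grad_norm(list(model.parameters()), p=2)              # Euclidean norm of all grads
inf_norm = calc_grad_norm(list(model.parameters()), p=float("inf"))  # largest absolute grad entry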
45 changes: 45 additions & 0 deletions fairscale/utils/parallel.py
@@ -0,0 +1,45 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.

"""Useful functions for parallel training."""

from typing import List

import torch
import torch.distributed as dist
from torch.distributed import ProcessGroup
import torch.nn.functional as F


def chunk_and_pad(tensor: torch.Tensor, num_chunks: int) -> List[torch.Tensor]:
"""Chunk a given Tensor into num_chunks parts and add any necessary padding."""
chunks = list(torch.flatten(tensor).chunk(num_chunks))
# torch.chunk may return fewer than num_chunks chunks, pad accordingly.
num_pad_for_partial_chunk = chunks[0].numel() - chunks[-1].numel()
if num_pad_for_partial_chunk > 0:
chunks[-1] = F.pad(chunks[-1], [0, num_pad_for_partial_chunk])
if len(chunks) < num_chunks:
chunks.extend([torch.zeros_like(chunks[0]) for _ in range(num_chunks - len(chunks))])
return chunks


def validate_process_group(device: torch.device, process_group: ProcessGroup) -> None:
"""Do a quick test in case user called FSDP without calling torch.cuda.set_device()
correctly. This can easily happen in cpu_offload case where the model resides on
the CPU.
"""
if not hasattr(process_group, "allgather"):
# Likely a dummy pg for unit test, skip checking.
return

world_size = process_group.size()
if "cuda" in str(device):
input_tensor = torch.ones(1).to(device)
output = list(torch.zeros(world_size).to(device).chunk(world_size))
dist.all_gather(output, input_tensor, group=process_group)
assert torch.cat(output).sum() == float(world_size), (
f"found {torch.cat(output).sum()} devices in process group but "
f"world_size={world_size}. Check torch.cuda.set_device is called properly"
)
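
chunk_and_pad above always returns exactly num_chunks equally sized chunks, zero-padding the tail so downstream reduce-scatter calls see uniform shapes. A tiny behavioral sketch:

import torch
from fairscale.utils.parallel import chunk_and_pad

t = torch.arange(10.0)                      # 10 elements, not divisible by 4
chunks = chunk_and_pad(t, num_chunks=4)

assert len(chunks) == 4
assert all(c.numel() == 3 for c in chunks)  # torch.chunk gives ceil(10 / 4) = 3 per chunk
# chunks[-1] is tensor([9., 0., 0.]): the partial last chunk was zero-padded to full size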
151 changes: 151 additions & 0 deletions fairscale/utils/reduce_scatter_bucketer.py
@@ -0,0 +1,151 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.

import functools
from typing import Callable, Dict, List, Optional, Tuple

import torch
from torch import Tensor
import torch.distributed as dist
from torch.distributed import ProcessGroup


class Bucket:
def __init__(self, data: Tensor, group: ProcessGroup):
self.data = data
self.group = group
self.offset = 0
self.callbacks: List[Callable] = []
self.output_shard = torch.zeros_like(data[0])

def flush(self) -> None:
if self.offset == 0:
assert len(self.callbacks) == 0
return
# reduce-scatter bucket
dist.reduce_scatter(
self.output_shard[: self.offset], list(self.data[:, : self.offset].unbind(0)), group=self.group
)
# execute post-reduction callbacks
for callback_fn in self.callbacks:
callback_fn()
# reuse input bucket but allocate a fresh output shard
self.data[:, : self.offset].zero_()
self.offset = 0
self.callbacks.clear()
self.output_shard = torch.zeros_like(self.data[0])


class ReduceScatterBucketer:
"""
Helper for bucketing multiple reduce-scatter operations on small tensors
into larger reduce-scatter ops to improve communication efficiency.

Usage::

bucketer = ReduceScatterBucketer()
bucketer.reduce_scatter_async(
small_tensors, callback_fn=lambda result: print("small")
)
bucketer.reduce_scatter_async(
big_tensors, callback_fn=lambda result: print("big")
)
bucketer.reduce_scatter_async(
more_small_tensors, callback_fn=lambda result: print("small2")
)
bucketer.flush() # callbacks only guaranteed to be called after flush()
# Example output (note that it is out of order, due to bucketing):
# big
# small
# small2

Args:
bucket_cap_mb (int, Optional): bucket size for communicating. Buckets
are sub-divided based on world_size. Values <= 0 disable bucketing.
"""

def __init__(self, bucket_cap_mb: int = 25):
self.bucket_cap_mb = bucket_cap_mb
self.buckets: Dict[Tuple[torch.dtype, torch.device, ProcessGroup], Bucket] = {}

@torch.no_grad()
def reduce_scatter_async(
self, input_list: List[Tensor], group: ProcessGroup, callback_fn: Optional[Callable] = None,
) -> None:
"""
Reduce-scatter a list of tensors asynchronously, so smaller reductions
can be bucketed together. The given callback (``callback_fn``) will be
called with the reduced result at some later time. Call ``flush()`` to
force all queued ops and callbacks to be executed.

Note that large inputs will be reduced immediately, and this function
may also flush the relevant bucket to make room for ``input_list``.

Args:
input_list (List[Tensor]): list of tensors to reduce-scatter. List
should contain ``group.size()`` tensors and each tensor should
have identical shape, dtype and device.
group (ProcessGroup): process group for reduction
callback_fn (Callable, Optional): callback function to call after
the reduction executes. Function will be called with a single
argument corresponding to the reduced result.
"""
world_size = group.size()

assert (
len(input_list) == world_size
), f"reduce_scatter received {len(input_list)} inputs, expected group.size() ({world_size})"

first_input = input_list[0]
first_input_size = first_input.numel()

bucket_shard_size = self._get_shard_size(first_input.element_size(), world_size)
if first_input_size > bucket_shard_size:
# input is too big to fit in the bucket, reduce-scatter directly
output = torch.zeros_like(input_list[0])
dist.reduce_scatter(output, input_list, group=group)
if callback_fn is not None:
callback_fn(output)
return

bucket = self._get_bucket(first_input, group)
if first_input_size > bucket.data.size(1) - bucket.offset:
# not enough space remaining in bucket, flush it now
bucket.flush()

# copy data from input_list into bucket
stacked_input = torch.stack(input_list).view(world_size, first_input_size)
offset = bucket.offset
bucket.data[:, offset : offset + first_input_size].copy_(stacked_input)
bucket.offset += first_input_size

# callback will be given the reduced result
if callback_fn is not None:
result_view = bucket.output_shard[offset : offset + first_input_size].view_as(first_input)
bucket.callbacks.append(functools.partial(callback_fn, result_view))

@torch.no_grad()
def flush(self) -> None:
"""Reduce-scatter any partial buckets."""
for bucket in self.buckets.values():
bucket.flush()

@functools.lru_cache()
def _get_shard_size(self, element_size: int, num_shards: int) -> int:
if self.bucket_cap_mb <= 0: # Values <= 0 disable bucketing.
return 0
MB = 1024 * 1024
bucket_size = self.bucket_cap_mb * MB / element_size
return int(bucket_size // num_shards)

def _get_bucket(self, tensor: Tensor, group: ProcessGroup) -> Bucket:
key = (tensor.dtype, tensor.device, group)
if key not in self.buckets:
# buckets are divided into world_size pieces, bucket.data shaped (world_size, shard_size)
world_size = group.size()
shard_size = self._get_shard_size(tensor.element_size(), world_size)
data = tensor.new_zeros((world_size, shard_size))
self.buckets[key] = Bucket(data, group)
return self.buckets[key]
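
For a sense of scale, here is the arithmetic _get_shard_size performs under the default bucket_cap_mb=25, assuming fp32 gradients and 8 workers (the numbers are illustrative, not taken from the PR):

# Worked example of _get_shard_size's arithmetic (illustrative values).
bucket_cap_mb = 25
element_size = 4      # bytes per fp32 element
world_size = 8

MB = 1024 * 1024
bucket_elems = bucket_cap_mb * MB / element_size  # 6,553,600 elements in the whole bucket
shard_size = int(bucket_elems // world_size)      # 819,200 elements per rank

# Any reduce_scatter_async input whose per-rank tensor exceeds shard_size
# bypasses the bucket and is reduced immediately.
print(shard_size)  # 819200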