Upload custom config and model files
- __init__.py +1 -1
- aux_losses.py +88 -0
- configuration.py +42 -0
- moe.py +133 -0
__init__.py
CHANGED
@@ -1,2 +1,2 @@
 from .configuration_moegpt import MoEGPTConfig
-from .
+from .modeling import MoEGPTForCausalLM
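With this export in place, the custom classes can be loaded from the Hub via trust_remote_code. A minimal sketch, assuming the repo's config.json carries the usual auto_map entries; "user/moegpt" is a placeholder, not the actual Hub path:

from transformers import AutoConfig, AutoModelForCausalLM

config = AutoConfig.from_pretrained("user/moegpt", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("user/moegpt", trust_remote_code=True)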
aux_losses.py
ADDED
@@ -0,0 +1,88 @@
import torch
import torch.nn as nn
import torch.nn.functional as F


def log_mean(x, dim):
    return torch.logsumexp(x, dim=dim) - torch.log(
        torch.tensor(x.shape[dim], dtype=torch.float32)
    )


def entropy_reg(logits: torch.Tensor, mean_over_batch: bool = True):
    """Entropy regularization for the router."""

    entropy_l = lambda l: -(l * l.exp()).sum(-1)
    # softmax over experts
    # logits: [batch_size * sequence_length, num_experts]
    logprobs = F.log_softmax(logits, dim=-1)
    if mean_over_batch:
        # take mean probability over batch
        logprobs = log_mean(logprobs, 0)

    return -entropy_l(logprobs).mean()
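A quick sanity check of entropy_reg (a sketch, not part of the commit; assumes the function above is in scope): with perfectly uniform router logits, the batch-averaged distribution has maximal entropy log(num_experts), and the function returns its negative, so minimizing this term pushes the router toward uniform expert usage.

import math

logits = torch.zeros(8, 4)  # [num_tokens, num_experts], all experts tied
val = entropy_reg(logits)
assert torch.isclose(val, torch.tensor(-math.log(4.0)), atol=1e-6)  # ≈ -1.3863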
# the two losses below are adapted from
# https://github.com/google/flaxformer/blob/b725bd2a51d70e866d819c92de166fbf24425e6a/flaxformer/architectures/moe/routing.py
def load_balancing_loss(logits: torch.Tensor, expert_indices: torch.Tensor) -> float:
    """Computes the auxiliary load balancing loss as in Switch Transformer.

    See Switch Transformer (https://arxiv.org/abs/2101.03961). This function
    implements the loss function presented in equations (4) - (6). It aims to
    penalize those cases where the routing between experts is unbalanced.

    Args:
        logits: logits assigned to each expert per token. Shape:
            <float32>[batch_size * sequence_length, num_experts].
        expert_indices: <int>[batch_size * sequence_length, num_selected_experts]
            indices identifying the top num_selected_experts for a given token.

    Returns:
        The auxiliary loss.
    """
    # num_token = batch_size * sequence_length
    num_token, num_experts = logits.shape

    # Shape: [batch_size * sequence_length, num_selected_experts, num_experts].
    expert_mask = F.one_hot(expert_indices, num_experts)
    # For a given token, determine if it was routed to a given expert.
    # Shape: [batch_size * sequence_length, num_experts]
    expert_mask, _ = torch.max(expert_mask, dim=-2)

    # shape [num_experts]
    tokens_per_expert = torch.mean(expert_mask, dim=0, dtype=torch.float32)

    # compute router probability per expert in log space for numerical stability
    logprobs = F.log_softmax(logits, dim=-1)
    # take mean probability over batch
    # shape [num_experts]
    logprobs = log_mean(logprobs, dim=0)
    router_prob_per_expert = torch.exp(logprobs)
    return (
        torch.mean(  # mean over experts
            tokens_per_expert * router_prob_per_expert,
            dtype=torch.float32,
        )
        * num_experts
    )
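Worked example (a sketch, assuming the function above is in scope): with uniform logits and perfectly balanced top-1 routing over 4 experts, the token fraction f_i and the mean router probability p_i both equal 1/4, so the function returns num_experts * mean(f_i * p_i) = sum_i f_i * p_i = 1/4.

logits = torch.zeros(8, 4)
expert_indices = torch.tensor([0, 1, 2, 3, 0, 1, 2, 3]).unsqueeze(-1)  # balanced top-1
loss = load_balancing_loss(logits, expert_indices)
assert torch.isclose(loss, torch.tensor(0.25))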
def router_z_loss(router_logits: torch.Tensor) -> float:
    """Compute the router z-loss.

    The router z-loss was introduced in Designing Effective Sparse Expert Models
    (https://arxiv.org/abs/2202.08906). It encourages router logits to remain
    small in an effort to improve stability.

    Args:
        router_logits: <float>[batch_size * sequence_length, num_experts]
            router logits.

    Returns:
        Scalar router z-loss.
    """
    num_tokens, _ = router_logits.shape
    log_z = torch.logsumexp(router_logits, dim=-1)
    z_loss = log_z**2
    return torch.sum(z_loss, dtype=torch.float32) / num_tokens
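The config below exposes moe_aux_loss_factor and moe_z_loss_factor (and moe_router_loss="load_balancing_z_loss"), which suggests these terms are scaled and added to the language-modeling loss. A hedged sketch of that wiring; the actual combination lives in modeling.py, which is not part of this commit:

def total_loss(lm_loss, router_logits, selected_experts, config):
    # "load_balancing_z_loss" routing loss: both auxiliary terms, scaled by
    # their config factors and added to the LM loss
    aux = load_balancing_loss(router_logits, selected_experts)
    z = router_z_loss(router_logits)
    return lm_loss + config.moe_aux_loss_factor * aux + config.moe_z_loss_factor * z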
configuration.py
ADDED
@@ -0,0 +1,42 @@
from transformers import PretrainedConfig


class MoEGPTConfig(PretrainedConfig):
    model_type = "moegpt"

    def __init__(
        self,
        vocab_size=50304,
        n_embd=768,
        n_layer=12,
        n_head=12,
        sequence_length=1024,
        moe=False,
        moe_routing="standard_gating",
        moe_num_experts=4,
        moe_num_experts_per_tok=2,
        moe_softmax_order="softmax_topk",
        moe_router_loss="load_balancing_z_loss",
        moe_aux_loss_factor=0.01,
        moe_z_loss_factor=1.0,
        mlp_dim_exp_factor=1.0,
        dropout=0.0,
        bias=False,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.n_embd = n_embd
        self.n_layer = n_layer
        self.n_head = n_head
        self.sequence_length = sequence_length
        self.moe = moe
        self.moe_routing = moe_routing
        self.moe_num_experts = moe_num_experts
        self.moe_num_experts_per_tok = moe_num_experts_per_tok
        self.moe_softmax_order = moe_softmax_order
        self.moe_router_loss = moe_router_loss
        self.moe_aux_loss_factor = moe_aux_loss_factor
        self.moe_z_loss_factor = moe_z_loss_factor
        self.mlp_dim_exp_factor = mlp_dim_exp_factor
        self.dropout = dropout
        self.bias = bias
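A minimal usage sketch (not part of the commit): instantiate the config with MoE enabled, and optionally register it so AutoConfig can resolve the custom model_type.

from transformers import AutoConfig

config = MoEGPTConfig(moe=True, moe_num_experts=8, moe_num_experts_per_tok=2)
AutoConfig.register("moegpt", MoEGPTConfig)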
moe.py
ADDED
@@ -0,0 +1,133 @@
"""
Simple MoE routing implementations that replace the MLP block in a standard transformer.
References:
1) Mistral Source for Mixtral MoEs:
https://github.com/mistralai/mistral-src
2) ST-MoE:
https://arxiv.org/abs/2202.08906
3) Our notepad of MoE resources:
https://docs.google.com/document/d/1NuQ5jr7V-Jv1ui7p4KrxO_JTz-7bpYcYMmh49EeJ-QA/edit?usp=sharing
"""
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import bisect


class MoE(nn.Module):
    """
    Simplest MoE implementation with a linear router and softmax over experts.

    Note that in this implementation, we simply loop over the experts and
    aggregate the results. This is not the most efficient way to do it, but
    it also avoids the large memory overhead _and_ has no token dropping
    (because we do not need the capacity factor).
    """

    def __init__(self, config, mlp):
        super().__init__()
        assert config.moe_num_experts > 0
        self.experts = nn.ModuleList(
            [mlp(config=config) for _ in range(config.moe_num_experts)]
        )
        self.router = nn.Linear(config.n_embd, config.moe_num_experts, bias=False)
        self.top_k = config.moe_num_experts_per_tok
        self.softmax_order = config.moe_softmax_order

    def forward(self, inputs: torch.Tensor):
        # [batch_size * sequence_length, n_embd]
        inputs_squashed = inputs.view(-1, inputs.shape[-1])
        # [batch_size * sequence_length, num_experts]
        router_logits = self.router(inputs_squashed)

        # note that the selected experts will be the same for both orders:
        # softmax doesn't change the top-k, but the weights are different
        if self.softmax_order == "softmax_topk":
            all_probs = F.softmax(router_logits, dim=1)
            weights, selected_experts = torch.topk(all_probs, self.top_k)
        elif self.softmax_order == "topk_softmax":
            weights, selected_experts = torch.topk(router_logits, self.top_k)
            weights = F.softmax(weights, dim=-1)
        else:
            raise ValueError(f"Unknown softmax_order: {self.softmax_order}")

        results = torch.zeros_like(inputs_squashed)
        # naive looping over experts
        for i, expert in enumerate(self.experts):
            # find the tokens routed to this expert and their rank in the top-k
            batch_idx, nth_expert = torch.where(selected_experts == i)
            output, _ = expert(inputs_squashed[batch_idx])
            results[batch_idx] += weights[batch_idx, nth_expert, None] * output

        # return results and router logits (for aux loss calculation later)
        return results.view_as(inputs), {
            "router_logits": router_logits,
            "selected_experts": selected_experts,
        }
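A toy forward-pass sketch. The real expert MLP lives in modeling.py (not shown in this commit); this stand-in only mimics the interface MoE expects: constructed as mlp(config=config) and returning an (output, aux) tuple, since the loop unpacks output, _ = expert(...). It assumes MoEGPTConfig is imported from configuration.py.

class ToyMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.fc = nn.Linear(config.n_embd, config.n_embd)

    def forward(self, x):
        return self.fc(x), {}

config = MoEGPTConfig(moe=True, n_embd=16, sequence_length=8)  # defaults: 4 experts, top-2
moe = MoE(config, ToyMLP)
x = torch.randn(2, 8, 16)  # [batch, seq, n_embd]
y, aux = moe(x)
assert y.shape == x.shape
assert aux["router_logits"].shape == (16, config.moe_num_experts)  # 2 * 8 tokens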
class DummyExpert(nn.Module):
    """Always-zero expert appended by MaskedMoE behind the real experts."""

    def __init__(self, output_size: int):
        super().__init__()
        self._output_size = output_size

    def forward(self, inputs: torch.Tensor):
        # returns an (output, aux) tuple to match the MLP expert interface
        out = torch.zeros((self._output_size,), device=inputs.device)
        return out, {}
class MaskedMoE(MoE):
    """MoE variant whose router logits are multiplied by a per-batch expert
    mask, with a DummyExpert appended as an always-available extra slot."""

    def __init__(self, config, mlp):
        super().__init__(config, mlp)
        self._sequence_length = config.sequence_length
        self.experts.append(DummyExpert(config.n_embd))
        # one extra router output for the dummy expert
        self.router = nn.Linear(config.n_embd, config.moe_num_experts + 1, bias=False)

    def forward(self, inputs: torch.Tensor, mask: torch.Tensor):
        inputs_squashed = inputs.view(-1, inputs.shape[-1])
        router_logits = self.router(inputs_squashed)
        # append an always-on column for the dummy expert, then expand the
        # per-batch mask to one row per token
        mask = torch.cat(
            (mask, torch.ones((mask.shape[0], 1), device=mask.device)),
            dim=1,
        )
        mask = mask.repeat_interleave(self._sequence_length, dim=0)
        router_logits = router_logits * mask

        # note that the selected experts will be the same for both orders:
        # softmax doesn't change the top-k, but the weights are different
        if self.softmax_order == "softmax_topk":
            all_probs = F.softmax(router_logits, dim=1)
            weights, selected_experts = torch.topk(all_probs, self.top_k)
        elif self.softmax_order == "topk_softmax":
            weights, selected_experts = torch.topk(router_logits, self.top_k)
            weights = F.softmax(weights, dim=-1)
        else:
            raise ValueError(f"Unknown softmax_order: {self.softmax_order}")

        results = torch.zeros_like(inputs_squashed)
        # naive looping over experts
        for i, expert in enumerate(self.experts):
            batch_idx, nth_expert = torch.where(selected_experts == i)
            output, _ = expert(inputs_squashed[batch_idx])
            results[batch_idx] += weights[batch_idx, nth_expert, None] * output

        # return results and router logits (for aux loss calculation later)
        return results.view_as(inputs), {
            "router_logits": router_logits,
            "selected_experts": selected_experts,
        }
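Shape walkthrough for the masking (a sketch, not part of the commit): the caller provides one mask row per batch element over the real experts; the forward pass appends an always-on column for the DummyExpert and then expands the mask to one row per token. Note that multiplying a logit by 0 pins it at 0 rather than -inf, so under this scheme a masked expert can still receive softmax weight when the remaining logits are negative.

batch_size, seq_len = 2, 4
mask = torch.tensor([[1., 1., 0.],    # batch 0: expert 2 disabled
                     [1., 0., 0.]])   # batch 1: experts 1 and 2 disabled
mask = torch.cat((mask, torch.ones(batch_size, 1)), dim=1)  # [2, 4], dummy column on
mask = mask.repeat_interleave(seq_len, dim=0)               # [8, 4], one row per token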
class TimeDependantMoE(nn.Module):
    """Wraps MaskedMoE so that only the first `date` experts are active for
    each batch element."""

    def __init__(self, config, mlp):
        super().__init__()
        self._num_experts = config.moe_num_experts
        self._mask_moe = MaskedMoE(config, mlp)

    def forward(self, x, date):
        # expert i is active iff i < date for that batch element
        range_tensor = torch.arange(self._num_experts, device=x.device).unsqueeze(0)
        mask_date = (range_tensor < date.unsqueeze(1)).float()
        return self._mask_moe(x, mask_date)
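Usage sketch (assuming ToyMLP from the MoE example above): date gives, per batch element, how many experts are unlocked, so expert i stays active iff i < date.

config = MoEGPTConfig(moe=True, moe_num_experts=4, n_embd=16, sequence_length=8)
moe = TimeDependantMoE(config, ToyMLP)
x = torch.randn(2, 8, 16)
date = torch.tensor([2, 4])  # batch 0 may use experts 0-1; batch 1 all four
y, aux = moe(x, date)
assert y.shape == x.shape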