Source code for recwizard.modules.kbrd.transformer_encoder_decoder

""" This file is adapted from the KBRD original implementation: https://github.com/THUDM/KBRD
"""

import math

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F


class TorchGeneratorModel(nn.Module):
    """
    This interface expects you to implement a model with the following attributes:

    Attributes:
        `START`: LongTensor representing the start of sentence
        `END`: LongTensor representing the end of sentence
        `NULL_IDX`: index of the null (padding) token
        `model.encoder`: takes input, returns tuple (enc_out, enc_hidden, attn_mask)
        `model.decoder`: takes decoder params and returns decoder outputs after attn
        `model.output`: takes decoder outputs and returns a distribution over the dictionary
    """

    def __init__(
        self,
        padding_idx=0,
        start_idx=1,
        end_idx=2,
        unknown_idx=3,
        input_dropout=0,
        longest_label=1,
    ):
        super().__init__()
        self.NULL_IDX = padding_idx
        self.END_IDX = end_idx
        self.register_buffer("START", torch.LongTensor([start_idx]))
        self.longest_label = longest_label

    def _starts(self, bsz):
        """Return bsz start tokens."""
        return self.START.detach().expand(bsz, 1)

    def decode_greedy(self, encoder_states, bsz, maxlen):
        """
        Greedy search.

        Args:
            encoder_states: output of the encoder model
            bsz: batch size
            maxlen: max number of tokens to decode

        Return:
            pair (logits, choices) of the greedy decode, shapes:
            (batch_size, max_len, #vocab), (batch_size, max_len)
        """
        xs = self._starts(bsz)
        incr_state = None
        logits = []
        for i in range(maxlen):
            # todo, break early if all beams saw EOS
            scores, incr_state = self.decoder(xs, encoder_states, incr_state)
            scores = scores[:, -1:, :]
            scores = self.output(scores)
            _, preds = scores.max(dim=-1)
            logits.append(scores)
            xs = torch.cat([xs, preds], dim=1)
            # check if everyone has generated an end token
            all_finished = ((xs == self.END_IDX).sum(dim=1) > 0).sum().item() == bsz
            if all_finished:
                break
        logits = torch.cat(logits, 1)
        return logits, xs

    def decode_forced(self, encoder_states, ys):
        """
        Decode with a fixed, true sequence, computing loss.

        Useful for training, or ranking fixed candidates.

        Args:
            encoder_states: output of the encoder model,
                shape: (batch_size, seq_len, hidden_size)
            ys: target tokens, shape: (batch_size, tgt_len)

        Return:
            pair (logits, choices) containing the logits and MLE predictions, shapes:
            (batch_size, tgt_len, #vocab), (batch_size, tgt_len)
        """
        bsz = ys.size(0)
        seqlen = ys.size(1)
        inputs = ys.narrow(1, 0, seqlen - 1)
        inputs = torch.cat([self._starts(bsz), inputs], 1)
        latent, _ = self.decoder(inputs, encoder_states)
        logits = self.output(latent)
        _, preds = logits.max(dim=2)
        return logits, preds

    def reorder_encoder_states(self, encoder_states, indices):
        """
        Reorder encoder states according to a new set of indices.

        This is an abstract method, and *must* be implemented by the user. It
        provides beam search with a model-agnostic interface for reordering the
        encoder output, e.g. to sort hypotheses or expand beams.

        For example, assume that encoder_states is a bsz x 1 tensor of values

        ```python
        indices = [0, 2, 2]
        encoder_states = [[0.1]
                          [0.2]
                          [0.3]]
        ```

        then the output will be

        ```python
        output = [[0.1]
                  [0.3]
                  [0.3]]
        ```

        Args:
            encoder_states: output from encoder. Type is model specific.
            indices (List[int]): the indices to select over. The user must
                support non-tensor inputs.

        Return:
            The re-ordered encoder states. It should be of the same type as
            encoder_states, and it must be a valid input to the decoder.
        """
        raise NotImplementedError(
            "reorder_encoder_states must be implemented by the model"
        )

    def reorder_decoder_incremental_state(self, incremental_state, inds):
        """
        Reorder incremental state for the decoder.

        Used to expand selected beams in beam search. Unlike
        reorder_encoder_states, implementing this method is optional. However,
        without incremental decoding, decoding a single beam becomes O(n^2)
        instead of O(n), which can make beam search impractically slow.

        In order to fall back to non-incremental decoding, just return None
        from this method.

        Args:
            incremental_state: second output of model.decoder
            inds (torch.LongTensor): indices to select and reorder over.

        Returns:
            The re-ordered decoder incremental states. It should be the same
            type as incremental_state, and usable as an input to the decoder.
            This method should return None if the model does not support
            incremental decoding.
        """
        raise NotImplementedError(
            "reorder_decoder_incremental_state must be implemented by model"
        )

    def forward(
        self, *xs, ys=None, cand_params=None, prev_enc=None, maxlen=None, bsz=None
    ):
        """
        Get output predictions from the model.

        Args:
            xs (torch.LongTensor): input to the encoder,
                shape: (batch_size, seq_len)
            ys (torch.LongTensor): expected output from the decoder,
                shape: (batch_size, seq_len)
            cand_params (torch.FloatTensor): parameters for candidate generation,
                shape: (batch_size, num_cands, num_params)
            prev_enc (torch.FloatTensor): if you know you'll pass in the same xs
                multiple times, you can pass in the encoder output from the last
                forward pass to skip recalculating the same encoder output.
            maxlen (int): max number of tokens to decode. If not set, the length
                of the longest label this model has seen is used. Ignored when
                ys is not None.
            bsz (int): if ys is not provided, then you must specify the bsz for
                greedy decoding.

        Returns:
            (scores, preds, encoder_states) tuple

            - scores contains the model's predicted token scores.
              (FloatTensor[batch_size, seq_len, #vocab])
            - preds contains the teacher-forced or greedy token predictions.
              (LongTensor[batch_size, seq_len])
            - encoder_states are the output of model.encoder. Model specific
              types. Feed this back in to skip encoding on the next call.
        """
        if ys is not None:
            # TODO: get rid of longest_label
            # keep track of longest label we've ever seen
            # we'll never produce longer ones than that during prediction
            self.longest_label = max(self.longest_label, ys.size(1))

        # use cached encoding if available
        encoder_states = prev_enc if prev_enc is not None else self.encoder(*xs)

        if ys is not None:
            # use teacher forcing
            scores, preds = self.decode_forced(encoder_states, ys)
        else:
            scores, preds = self.decode_greedy(
                encoder_states, bsz, maxlen or self.longest_label
            )

        return scores, preds, encoder_states


def _normalize(tensor, norm_layer):
    """Broadcast layer norm."""
    size = tensor.size()
    return norm_layer(tensor.view(-1, size[-1])).view(size)


def _create_embeddings(vocab_size, embedding_size, padding_idx):
    """Create and initialize word embeddings."""
    e = nn.Embedding(vocab_size, embedding_size, padding_idx)
    nn.init.normal_(e.weight, mean=0, std=embedding_size**-0.5)
    nn.init.constant_(e.weight[padding_idx], 0)
    return e


def _build_encoder(config, embedding=None, padding_idx=None, reduction=True):
    return TransformerEncoder(
        n_heads=config.n_heads,
        n_layers=config.n_layers,
        embedding_size=config.gen_dim,
        ffn_size=config.ffn_size,
        vocabulary_size=config.vocab_size,
        embedding=embedding,
        dropout=config.dropout,
        attention_dropout=config.attention_dropout,
        relu_dropout=config.relu_dropout,
        padding_idx=padding_idx,
        learn_positional_embeddings=config.learn_positional_embeddings,
        embeddings_scale=config.embeddings_scale,
        reduction=reduction,
        n_positions=config.n_positions,
    )


def _build_decoder(config, embedding=None, padding_idx=None):
    return TransformerDecoder(
        n_heads=config.n_heads,
        n_layers=config.n_layers,
        embedding_size=config.gen_dim,
        ffn_size=config.ffn_size,
        vocabulary_size=config.vocab_size,
        embedding=embedding,
        dropout=config.dropout,
        attention_dropout=config.attention_dropout,
        relu_dropout=config.relu_dropout,
        padding_idx=padding_idx,
        learn_positional_embeddings=config.learn_positional_embeddings,
        embeddings_scale=config.embeddings_scale,
        n_positions=config.n_positions,
    )


def create_position_codes(n_pos, dim, out):
    """Fill `out` with fixed sinusoidal position codes."""
    position_enc = np.array(
        [
            [pos / np.power(10000, 2 * j / dim) for j in range(dim // 2)]
            for pos in range(n_pos)
        ]
    )

    out.detach_()
    out.requires_grad = False
    out[:, 0::2] = torch.FloatTensor(np.sin(position_enc)).type_as(out)
    out[:, 1::2] = torch.FloatTensor(np.cos(position_enc)).type_as(out)


class TransformerResponseWrapper(nn.Module):
    """Transformer response wrapper. Pushes input through transformer and MLP."""

    def __init__(self, transformer, hdim):
        super(TransformerResponseWrapper, self).__init__()
        dim = transformer.out_dim
        self.transformer = transformer
        self.mlp = nn.Sequential(nn.Linear(dim, hdim), nn.ReLU(), nn.Linear(hdim, dim))

    def forward(self, *args):
        return self.mlp(self.transformer(*args))
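

# Illustrative sketch (an assumption, not part of the original KBRD source): shows
# how the sinusoidal position codes above can be inspected. The sizes are arbitrary;
# even columns hold sin(pos / 10000^(2j/dim)) and odd columns the matching cosine.
def _demo_position_codes(n_pos=16, dim=8):  # pragma: no cover
    out = torch.zeros(n_pos, dim)
    create_position_codes(n_pos, dim, out)
    # out[pos, 0::2] == sin(pos / 10000^(2j/dim)), out[pos, 1::2] == cos(...)
    return out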


class TransformerEncoder(nn.Module):

    def __init__(
        self,
        n_heads,
        n_layers,
        embedding_size,
        ffn_size,
        vocabulary_size,
        embedding=None,
        dropout=0.0,
        attention_dropout=0.0,
        relu_dropout=0.0,
        padding_idx=0,
        learn_positional_embeddings=False,
        embeddings_scale=False,
        reduction=True,
        n_positions=1024,
    ):
        """Transformer encoder module.

        Args:
            n_heads (int): the number of multihead attention heads.
            n_layers (int): number of transformer layers.
            embedding_size (int): the embedding size. Must be a multiple of n_heads.
            ffn_size (int): the size of the hidden layer in the FFN.
            vocabulary_size (int): size of the vocabulary.
            embedding (nn.Embedding): an embedding matrix for the bottom layer of
                the transformer. If None, one is created for this encoder.
            dropout (float): Dropout used around embeddings and before layer
                normalizations. This is used in Vaswani 2017 and works well on
                large datasets.
            attention_dropout (float): Dropout performed after the multihead
                attention softmax. This is not used in Vaswani 2017.
            relu_dropout (float): Dropout used after the ReLU in the FFN. Not used
                in Vaswani 2017, but used in Tensor2Tensor.
            embeddings_scale (bool): Scale embeddings relative to their
                dimensionality. Found useful in fairseq.
            learn_positional_embeddings (bool): If off, sinusoidal embeddings are
                used. If on, position embeddings are learned from scratch.
            padding_idx (int): Reserved padding index in the embeddings matrix.
            reduction (bool): If on, return the mean-pooled sequence representation;
                otherwise return per-token outputs together with the mask.
            n_positions (int): Size of the position embeddings matrix.
        """
        super(TransformerEncoder, self).__init__()

        self.embedding_size = embedding_size
        self.ffn_size = ffn_size
        self.n_layers = n_layers
        self.n_heads = n_heads
        self.dim = embedding_size
        self.embeddings_scale = embeddings_scale
        self.reduction = reduction
        self.padding_idx = padding_idx
        # this is --dropout, not --relu-dropout or --attention-dropout
        self.dropout = nn.Dropout(p=dropout)
        self.out_dim = embedding_size

        assert (
            embedding_size % n_heads == 0
        ), "Transformer embedding size must be a multiple of n_heads"

        # check input formats:
        if embedding is not None:
            assert (
                embedding_size is None or embedding_size == embedding.weight.shape[1]
            ), "Embedding dim must match the embedding size."

        # an embedding matrix must be provided (the branch below is intentionally unreachable)
        if embedding is not None:
            self.embeddings = embedding
        else:
            assert False
            assert padding_idx is not None
            self.embeddings = nn.Embedding(
                vocabulary_size, embedding_size, padding_idx=padding_idx
            )
            nn.init.normal_(self.embeddings.weight, 0, embedding_size**-0.5)

        # create the positional embeddings
        self.position_embeddings = nn.Embedding(n_positions, embedding_size)
        if not learn_positional_embeddings:
            create_position_codes(
                n_positions, embedding_size, out=self.position_embeddings.weight
            )
        else:
            nn.init.normal_(self.position_embeddings.weight, 0, embedding_size**-0.5)

        # build the model
        self.layers = nn.ModuleList()
        for _ in range(self.n_layers):
            self.layers.append(
                TransformerEncoderLayer(
                    n_heads,
                    embedding_size,
                    ffn_size,
                    attention_dropout=attention_dropout,
                    relu_dropout=relu_dropout,
                    dropout=dropout,
                )
            )

    def forward(self, input):
        """
        input is a LongTensor of token indices, shape [batch, seq_len].

        The mask computed from it is a BoolTensor of shape [batch, seq_len],
        filled with 1 when inside the sequence and 0 outside.
        """
        mask = input != self.padding_idx
        positions = (mask.cumsum(dim=1, dtype=torch.int64) - 1).clamp_(min=0)
        tensor = self.embeddings(input)
        if self.embeddings_scale:
            tensor = tensor * np.sqrt(self.dim)
        tensor = tensor + self.position_embeddings(positions).expand_as(tensor)
        # --dropout on the embeddings
        tensor = self.dropout(tensor)

        tensor *= mask.unsqueeze(-1).type_as(tensor)
        for i in range(self.n_layers):
            tensor = self.layers[i](tensor, mask)

        if self.reduction:
            divisor = mask.type_as(tensor).sum(dim=1).unsqueeze(-1).clamp(min=1e-7)
            output = tensor.sum(dim=1) / divisor
            return output
        else:
            output = tensor
            return output, mask
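

# Illustrative sketch (an assumption, not part of the original module): builds a tiny
# TransformerEncoder around a fresh embedding table and checks the output shapes.
# All sizes here are arbitrary.
def _demo_encoder():  # pragma: no cover
    emb = _create_embeddings(vocab_size=100, embedding_size=8, padding_idx=0)
    enc = TransformerEncoder(
        n_heads=2,
        n_layers=1,
        embedding_size=8,
        ffn_size=16,
        vocabulary_size=100,
        embedding=emb,
        reduction=False,
    )
    tokens = torch.randint(1, 100, (4, 5))  # (batch, seq_len) of token ids
    output, mask = enc(tokens)  # output: (4, 5, 8), mask: (4, 5)
    return output, mask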


class TransformerEncoderLayer(nn.Module):

    def __init__(
        self,
        n_heads,
        embedding_size,
        ffn_size,
        attention_dropout=0.0,
        relu_dropout=0.0,
        dropout=0.0,
    ):
        super().__init__()
        self.dim = embedding_size
        self.ffn_dim = ffn_size
        self.attention = MultiHeadAttention(
            n_heads,
            embedding_size,
            dropout=attention_dropout,  # --attention-dropout
        )
        self.norm1 = nn.LayerNorm(embedding_size)
        self.ffn = TransformerFFN(embedding_size, ffn_size, relu_dropout=relu_dropout)
        self.norm2 = nn.LayerNorm(embedding_size)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, tensor, mask):
        tensor = tensor + self.dropout(self.attention(tensor, mask=mask))
        tensor = _normalize(tensor, self.norm1)
        tensor = tensor + self.dropout(self.ffn(tensor))
        tensor = _normalize(tensor, self.norm2)
        tensor *= mask.unsqueeze(-1).type_as(tensor)
        return tensor


class TransformerDecoder(nn.Module):

    def __init__(
        self,
        n_heads,
        n_layers,
        embedding_size,
        ffn_size,
        vocabulary_size,
        embedding=None,
        dropout=0.0,
        attention_dropout=0.0,
        relu_dropout=0.0,
        embeddings_scale=True,
        learn_positional_embeddings=False,
        padding_idx=None,
        n_positions=1024,
    ):
        """Transformer decoder module.

        Args:
            n_heads (int): the number of multihead attention heads.
            n_layers (int): number of transformer layers.
            embedding_size (int): the embedding size. Must be a multiple of n_heads.
            ffn_size (int): the size of the hidden layer in the FFN.
            vocabulary_size (int): size of the vocabulary.
            embedding (nn.Embedding): an embedding matrix for the bottom layer of
                the transformer. The caller is expected to pass one in (KBRD
                shares it with the encoder).
            dropout (float): Dropout used around embeddings and before layer
                normalizations. This is used in Vaswani 2017 and works well on
                large datasets.
            attention_dropout (float): Dropout performed after the multihead
                attention softmax. This is not used in Vaswani 2017.
            relu_dropout (float): Dropout used after the ReLU in the FFN. Not used
                in Vaswani 2017, but used in Tensor2Tensor.
            embeddings_scale (bool): Scale embeddings relative to their
                dimensionality. Found useful in fairseq.
            learn_positional_embeddings (bool): If off, sinusoidal embeddings are
                used. If on, position embeddings are learned from scratch.
            padding_idx (int): Reserved padding index in the embeddings matrix.
            n_positions (int): Size of the position embeddings matrix.
        """
        super().__init__()
        self.embedding_size = embedding_size
        self.ffn_size = ffn_size
        self.n_layers = n_layers
        self.n_heads = n_heads
        self.dim = embedding_size
        self.embeddings_scale = embeddings_scale
        self.dropout = nn.Dropout(p=dropout)  # --dropout
        self.out_dim = embedding_size
        assert (
            embedding_size % n_heads == 0
        ), "Transformer embedding size must be a multiple of n_heads"

        self.embeddings = embedding

        # create the positional embeddings
        self.position_embeddings = nn.Embedding(n_positions, embedding_size)
        if not learn_positional_embeddings:
            create_position_codes(
                n_positions, embedding_size, out=self.position_embeddings.weight
            )
        else:
            nn.init.normal_(self.position_embeddings.weight, 0, embedding_size**-0.5)

        # build the model
        self.layers = nn.ModuleList()
        for _ in range(self.n_layers):
            self.layers.append(
                TransformerDecoderLayer(
                    n_heads,
                    embedding_size,
                    ffn_size,
                    attention_dropout=attention_dropout,
                    relu_dropout=relu_dropout,
                    dropout=dropout,
                )
            )

    def forward(self, input, encoder_state, incr_state=None):
        encoder_output, encoder_mask = encoder_state

        seq_len = input.size(1)
        positions = input.new(seq_len).long()
        positions = torch.arange(seq_len, out=positions).unsqueeze(0)
        tensor = self.embeddings(input)
        if self.embeddings_scale:
            tensor = tensor * np.sqrt(self.dim)
        tensor = tensor + self.position_embeddings(positions).expand_as(tensor)
        tensor = self.dropout(tensor)  # --dropout

        for layer in self.layers:
            tensor = layer(tensor, encoder_output, encoder_mask)

        return tensor, None
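

# Illustrative sketch (an assumption, not part of the original module): runs a tiny
# encoder/decoder pair on random token ids to show how the decoder consumes the
# (encoder_output, encoder_mask) tuple. Sizes are arbitrary.
def _demo_decoder():  # pragma: no cover
    emb = _create_embeddings(vocab_size=100, embedding_size=8, padding_idx=0)
    enc = TransformerEncoder(
        n_heads=2, n_layers=1, embedding_size=8, ffn_size=16,
        vocabulary_size=100, embedding=emb, reduction=False,
    )
    dec = TransformerDecoder(
        n_heads=2, n_layers=1, embedding_size=8, ffn_size=16,
        vocabulary_size=100, embedding=emb,
    )
    src = torch.randint(1, 100, (4, 5))  # context tokens
    tgt = torch.randint(1, 100, (4, 3))  # shifted target tokens
    latent, _ = dec(tgt, enc(src))  # latent: (4, 3, 8)
    return latent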


class TransformerDecoderLayer(nn.Module):

    def __init__(
        self,
        n_heads,
        embedding_size,
        ffn_size,
        attention_dropout=0.0,
        relu_dropout=0.0,
        dropout=0.0,
    ):
        super().__init__()
        self.dim = embedding_size
        self.ffn_dim = ffn_size
        self.dropout = nn.Dropout(p=dropout)

        self.self_attention = MultiHeadAttention(
            n_heads, embedding_size, dropout=attention_dropout
        )
        self.norm1 = nn.LayerNorm(embedding_size)

        self.encoder_attention = MultiHeadAttention(
            n_heads, embedding_size, dropout=attention_dropout
        )
        self.norm2 = nn.LayerNorm(embedding_size)

        self.ffn = TransformerFFN(embedding_size, ffn_size, relu_dropout=relu_dropout)
        self.norm3 = nn.LayerNorm(embedding_size)

    def forward(self, x, encoder_output, encoder_mask):
        decoder_mask = self._create_selfattn_mask(x)
        # first self attn
        residual = x
        # don't peek into the future!
        x = self.self_attention(query=x, mask=decoder_mask)
        x = self.dropout(x)  # --dropout
        x = x + residual
        x = _normalize(x, self.norm1)

        residual = x
        x = self.encoder_attention(
            query=x, key=encoder_output, value=encoder_output, mask=encoder_mask
        )
        x = self.dropout(x)  # --dropout
        x = residual + x
        x = _normalize(x, self.norm2)

        # finally the ffn
        residual = x
        x = self.ffn(x)
        x = self.dropout(x)  # --dropout
        x = residual + x
        x = _normalize(x, self.norm3)

        return x

    def _create_selfattn_mask(self, x):
        # figure out how many timesteps we need
        bsz = x.size(0)
        time = x.size(1)
        # make sure that we don't look into the future
        mask = torch.tril(x.new(time, time).fill_(1))
        # broadcast across batch
        mask = mask.unsqueeze(0).expand(bsz, -1, -1)
        return mask
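

# Illustrative sketch (an assumption, not part of the original module): shows the
# lower-triangular causal mask produced by _create_selfattn_mask, which prevents
# each position from attending to later positions.
def _demo_selfattn_mask():  # pragma: no cover
    layer = TransformerDecoderLayer(n_heads=2, embedding_size=8, ffn_size=16)
    x = torch.randn(4, 3, 8)
    mask = layer._create_selfattn_mask(x)  # (4, 3, 3), ones on and below the diagonal
    return mask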


class TransformerGeneratorModel(TorchGeneratorModel):

    def __init__(self, config, kbrd_rec):
        self.pad_idx = config.pad_idx
        self.start_idx = config.start_idx
        self.end_idx = config.end_idx
        super().__init__(self.pad_idx, self.start_idx, self.end_idx)
        self.embeddings = _create_embeddings(
            config.vocab_size, config.gen_dim, self.pad_idx
        )
        self.encoder = _build_encoder(
            config,
            self.embeddings,
            self.pad_idx,
            reduction=False,
        )
        self.decoder = _build_decoder(
            config,
            self.embeddings,
            self.pad_idx,
        )

        self.user_representation_to_bias_1 = nn.Linear(config.rec_dim, 512)
        self.user_representation_to_bias_2 = nn.Linear(512, config.vocab_size)

        # the recommender module is frozen; only the generator is trained
        self.kbrd = kbrd_rec
        for param in self.kbrd.parameters():
            param.requires_grad = False

    def reorder_encoder_states(self, encoder_states, indices):
        enc, mask = encoder_states
        if not torch.is_tensor(indices):
            indices = torch.LongTensor(indices).to(enc.device)
        enc = torch.index_select(enc, 0, indices)
        mask = torch.index_select(mask, 0, indices)
        return enc, mask

    def reorder_decoder_incremental_state(self, incremental_state, inds):
        # no support for incremental decoding at this time
        return None

    def output(self, tensor):
        # project back to vocabulary
        output = F.linear(tensor, self.embeddings.weight)
        if hasattr(self, "user_representation"):
            up_bias = self.user_representation_to_bias_2(
                F.relu(self.user_representation_to_bias_1(self.user_representation))
            )
            # expand to the whole sequence
            up_bias = up_bias.unsqueeze(dim=1)
            output += up_bias
        return output


class BasicAttention(nn.Module):

    def __init__(self, dim=1, attn="cosine"):
        super().__init__()
        self.softmax = nn.Softmax(dim=dim)
        if attn == "cosine":
            self.cosine = nn.CosineSimilarity(dim=dim)
        self.attn = attn
        self.dim = dim

    def forward(self, xs, ys):
        if self.attn == "cosine":
            l1 = self.cosine(xs, ys).unsqueeze(self.dim - 1)
        else:
            l1 = torch.bmm(xs, ys.transpose(1, 2))
            if self.attn == "sqrt":
                d_k = ys.size(-1)
                l1 = l1 / math.sqrt(d_k)
        l2 = self.softmax(l1)
        lhs_emb = torch.bmm(l2, ys)
        # add back the query
        lhs_emb = lhs_emb.add(xs)
        return lhs_emb.squeeze(self.dim - 1), l2


class MultiHeadAttention(nn.Module):

    def __init__(self, n_heads, dim, dropout=0):
        super(MultiHeadAttention, self).__init__()
        self.n_heads = n_heads
        self.dim = dim

        self.attn_dropout = nn.Dropout(p=dropout)  # --attention-dropout
        self.q_lin = nn.Linear(dim, dim)
        self.k_lin = nn.Linear(dim, dim)
        self.v_lin = nn.Linear(dim, dim)
        # TODO: merge for the initialization step
        nn.init.xavier_normal_(self.q_lin.weight)
        nn.init.xavier_normal_(self.k_lin.weight)
        nn.init.xavier_normal_(self.v_lin.weight)
        # and set biases to 0
        self.out_lin = nn.Linear(dim, dim)
        nn.init.xavier_normal_(self.out_lin.weight)

    def forward(self, query, key=None, value=None, mask=None):
        # Input is [B, query_len, dim]
        # Mask is [B, key_len] (padding mask) or [B, query_len, key_len] (causal mask)
        batch_size, query_len, dim = query.size()
        assert (
            dim == self.dim
        ), f"Dimensions do not match: {dim} query vs {self.dim} configured"
        assert mask is not None, "Mask is None, please specify a mask"
        n_heads = self.n_heads
        dim_per_head = dim // n_heads
        scale = math.sqrt(dim_per_head)

        def prepare_head(tensor):
            # input is [batch_size, seq_len, n_heads * dim_per_head]
            # output is [batch_size * n_heads, seq_len, dim_per_head]
            bsz, seq_len, _ = tensor.size()
            tensor = tensor.view(batch_size, tensor.size(1), n_heads, dim_per_head)
            tensor = (
                tensor.transpose(1, 2)
                .contiguous()
                .view(batch_size * n_heads, seq_len, dim_per_head)
            )
            return tensor

        # q, k, v are the transformed values
        if key is None and value is None:
            # self attention
            key = value = query
        elif value is None:
            # key and value are the same, but query differs
            # self attention
            value = key
        _, key_len, dim = key.size()

        q = prepare_head(self.q_lin(query))
        k = prepare_head(self.k_lin(key))
        v = prepare_head(self.v_lin(value))

        dot_prod = q.div_(scale).bmm(k.transpose(1, 2))
        # [B * n_heads, query_len, key_len]
        attn_mask = (
            (mask == 0)
            .view(batch_size, 1, -1, key_len)
            .repeat(1, n_heads, 1, 1)
            .expand(batch_size, n_heads, query_len, key_len)
            .view(batch_size * n_heads, query_len, key_len)
        )
        assert attn_mask.shape == dot_prod.shape
        dot_prod.masked_fill_(
            attn_mask, -torch.finfo(dot_prod.dtype).max
        )  # TODO: previous implementation used neginf, check if they are equivalent

        attn_weights = F.softmax(dot_prod, dim=-1).type_as(query)
        attn_weights = self.attn_dropout(attn_weights)  # --attention-dropout

        attentioned = attn_weights.bmm(v)
        attentioned = (
            attentioned.type_as(query)
            .view(batch_size, n_heads, query_len, dim_per_head)
            .transpose(1, 2)
            .contiguous()
            .view(batch_size, query_len, dim)
        )

        out = self.out_lin(attentioned)

        return out
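

# Illustrative sketch (an assumption, not part of the original module): a quick shape
# check of MultiHeadAttention in the self-attention configuration, with an all-ones
# padding mask. Sizes are arbitrary.
def _demo_attention():  # pragma: no cover
    attn = MultiHeadAttention(n_heads=2, dim=8)
    x = torch.randn(4, 5, 8)  # (batch, seq_len, dim)
    mask = torch.ones(4, 5, dtype=torch.bool)  # no padding positions
    out = attn(x, mask=mask)  # (4, 5, 8)
    return out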


class TransformerFFN(nn.Module):

    def __init__(self, dim, dim_hidden, relu_dropout=0):
        super(TransformerFFN, self).__init__()
        self.relu_dropout = nn.Dropout(p=relu_dropout)
        self.lin1 = nn.Linear(dim, dim_hidden)
        self.lin2 = nn.Linear(dim_hidden, dim)
        nn.init.xavier_uniform_(self.lin1.weight)
        nn.init.xavier_uniform_(self.lin2.weight)

    def forward(self, x):
        x = F.relu(self.lin1(x))
        x = self.relu_dropout(x)  # --relu-dropout
        x = self.lin2(x)
        return x
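

# Illustrative sketch (an assumption, not part of the original module): the
# position-wise feed-forward block maps (batch, seq_len, dim) back to the same shape.
def _demo_ffn():  # pragma: no cover
    ffn = TransformerFFN(dim=8, dim_hidden=16)
    return ffn(torch.randn(4, 5, 8))  # (4, 5, 8)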