Thursday, 17 December 2020

Where should we put attention in an autoencoder?

In this tutorial on the TensorFlow site we can see code for the implementation of an autoencoder whose Decoder is as follows:

import tensorflow as tf

class Decoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
    super(Decoder, self).__init__()
    self.batch_sz = batch_sz
    self.dec_units = dec_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(self.dec_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')
    self.fc = tf.keras.layers.Dense(vocab_size)

    # used for attention
    self.attention = BahdanauAttention(self.dec_units)

  def call(self, x, hidden, enc_output):
    # enc_output shape == (batch_size, max_length, hidden_size)
    context_vector, attention_weights = self.attention(hidden, enc_output)

    # x shape after passing through embedding == (batch_size, 1, embedding_dim)
    x = self.embedding(x)

    # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
    x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

    # passing the concatenated vector to the GRU
    output, state = self.gru(x)

    # output shape == (batch_size * 1, hidden_size)
    output = tf.reshape(output, (-1, output.shape[2]))

    # output shape == (batch_size, vocab)
    x = self.fc(output)

    return x, state, attention_weights

Here the BahdanauAttention is applied to the encoder output and the previous hidden state; the resulting context vector is concatenated with the embedding lookup of the input, and that concatenation is fed to the GRU.
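
For reference, the BahdanauAttention used by that tutorial is an additive attention layer roughly along the following lines (a sketch reconstructed from the tutorial, not a verbatim copy; it assumes the same shapes as the Decoder above):

class BahdanauAttention(tf.keras.layers.Layer):
  def __init__(self, units):
    super(BahdanauAttention, self).__init__()
    self.W1 = tf.keras.layers.Dense(units)
    self.W2 = tf.keras.layers.Dense(units)
    self.V = tf.keras.layers.Dense(1)

  def call(self, query, values):
    # query == previous decoder hidden state, shape (batch_size, hidden_size)
    # values == encoder output, shape (batch_size, max_length, hidden_size)
    query_with_time_axis = tf.expand_dims(query, 1)

    # score shape == (batch_size, max_length, 1)
    score = self.V(tf.nn.tanh(
        self.W1(query_with_time_axis) + self.W2(values)))

    # attention_weights shape == (batch_size, max_length, 1)
    attention_weights = tf.nn.softmax(score, axis=1)

    # context_vector shape after the sum == (batch_size, hidden_size)
    context_vector = attention_weights * values
    context_vector = tf.reduce_sum(context_vector, axis=1)

    return context_vector, attention_weights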

And yet in other code from this GitHub repository (implemented in PyTorch), the attention is applied to the output of the GRU:

import torch
import torch.nn as nn

class DecoderAttn(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, out_bias):
        super(DecoderAttn, self).__init__()
        self.hidden_size = hidden_size
        self.input_size = input_size
        
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.emb_drop = nn.Dropout(0.2)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.gru_drop = nn.Dropout(0.2)
        self.mlp = nn.Linear(hidden_size*2, output_size)
        if out_bias is not None:
            out_bias_tensor = torch.tensor(out_bias, requires_grad=False)
            self.mlp.bias.data[:] = out_bias_tensor
        self.logsoftmax = nn.LogSoftmax(dim=2)
        
        self.att_mlp = nn.Linear(hidden_size, hidden_size, bias=False)
        self.attn_softmax = nn.Softmax(dim=2)
    
    def forward(self, input, hidden, encoder_outs):
        emb = self.embedding(input)
        out, hidden = self.gru(self.emb_drop(emb), hidden)
        
        out_proj = self.att_mlp(out)
        enc_out_perm = encoder_outs.permute(0, 2, 1)
        e_exp = torch.bmm(out_proj, enc_out_perm)
        attn = self.attn_softmax(e_exp)
        
        ctx = torch.bmm(attn, encoder_outs)
        
        full_ctx = torch.cat([self.gru_drop(out), ctx], dim=2)
        
        out = self.mlp(full_ctx)
        out = self.logsoftmax(out)
        return out, hidden, attn
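
For completeness, this is a minimal sketch of how such a decoder could be called, with made-up sizes only to illustrate the expected shapes (the sizes and variable names are mine, not from the repository):

# illustrative sizes, not taken from the repository
batch_size, src_len, hidden_size, vocab_size = 4, 10, 256, 1000

decoder = DecoderAttn(input_size=vocab_size, hidden_size=hidden_size,
                      output_size=vocab_size, out_bias=None)

input_step = torch.randint(0, vocab_size, (batch_size, 1))    # current target token(s)
hidden = torch.zeros(1, batch_size, hidden_size)              # initial GRU hidden state
encoder_outs = torch.randn(batch_size, src_len, hidden_size)  # encoder outputs

out, hidden, attn = decoder(input_step, hidden, encoder_outs)
# out shape  == (batch_size, 1, vocab_size)  -- log-probabilities over the vocabulary
# attn shape == (batch_size, 1, src_len)     -- attention weights over encoder positions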

I wonder whether the second case is a mistake. If it is not a mistake, what is the difference between it and the first decoder, and how does changing where the attention is applied affect the output?
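
To make the contrast I am asking about explicit, the per-step flow of the two decoders is roughly the following (schematic Python comments only; the function names are placeholders, not real APIs):

# First decoder (TensorFlow tutorial): attention BEFORE the GRU,
# with the previous hidden state as the query.
#   context, weights = attention(query=prev_hidden, values=enc_output)
#   output, hidden   = gru(concat(context, embedding(x)))
#   logits           = fc(output)
#
# Second decoder (PyTorch repository): attention AFTER the GRU,
# with the current GRU output as the query.
#   output, hidden   = gru(embedding(x), prev_hidden)
#   context, weights = attention(query=output, values=encoder_outs)
#   logits           = fc(concat(output, context))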



