I am currently trying to get a hold of the TF2.0 api, but as I compared the GradientTape to a regular keras.Model.fit I noticed:
-
It ran slower(probably due to the Eager Execution)
-
It converged much slower (and I am not sure why).
+--------+--------------+------------------+
| Epoch | GradientTape | keras.Model.fit |
+--------+--------------+------------------+
| 1 | 0.905 | 0.8793 |
+--------+--------------+------------------+
| 2 | 0.352 | 0.2226 |
+--------+--------------+------------------+
| 3 | 0.285 | 0.1192 |
+--------+--------------+------------------+
| 4 | 0.282 | 0.1029 |
+--------+--------------+------------------+
| 5 | 0.275 | 0.0940 |
+--------+--------------+------------------+
Here is the training loop I used with the GradientTape:
optimizer = keras.optimizers.Adam()
glove_model = GloveModel(vocab_size=len(labels))
train_loss = keras.metrics.Mean(name='train_loss')
@tf.function
def train_step(examples, labels):
with tf.GradientTape() as tape:
predictions = glove_model(examples)
loss = glove_model.glove_loss(labels, predictions)
gradients = tape.gradient(loss, glove_model.trainable_variables)
optimizer.apply_gradients(zip(gradients, glove_model.trainable_variables))
train_loss(loss)
total_step = 0
for epoch in range(epochs_number):
pbar = tqdm(train_ds.enumerate(), total=int(len(index_data) / batch_size) + 1)
for ix, (examples, labels) in pbar:
train_step(examples, labels)
print(f"Epoch {epoch + 1}, Loss {train_loss.result()}")
# Reset the metrics for the next epoch
train_loss.reset_states()
And here is the Keras.Model.fit training:
glove_model.compile(optimizer, glove_model.glove_loss)
glove_model.fit(train_ds, epochs=epochs_number)
And Here is the model.
class GloveModel(keras.Model):
def __init__(self, vocab_size, dim=100, a=3/4, x_max=100):
super(GloveModel, self).__init__()
self.vocab_size = vocab_size
self.dim = dim
self.a = a
self.x_max = x_max
self.target_embedding = layers.Embedding(
input_dim=self.vocab_size, output_dim=self.dim, input_length=1, name="target_embedding"
)
self.target_bias = layers.Embedding(
input_dim=self.vocab_size, output_dim=1, input_length=1, name="target_bias"
)
self.context_embedding = layers.Embedding(
input_dim=self.vocab_size, output_dim=self.dim, input_length=1, name="context_embedding"
)
self.context_bias = layers.Embedding(
input_dim=self.vocab_size, output_dim=1, input_length=1, name="context_bias"
)
self.dot_product = layers.Dot(axes=-1, name="dot")
self.prediction = layers.Add(name="add")
self.step = 0
def call(self, inputs):
target_ix = inputs[:, 0]
context_ix = inputs[:, 1]
target_embedding = self.target_embedding(target_ix)
target_bias = self.target_bias(target_ix)
context_embedding = self.context_embedding(context_ix)
context_bias = self.context_bias(context_ix)
dot_product = self.dot_product([target_embedding, context_embedding])
prediction = self.prediction([dot_product, target_bias, context_bias])
return prediction
def glove_loss(self, y_true, y_pred):
weight = tf.math.minimum(
tf.math.pow(y_true/self.x_max, self.a), 1.0
)
loss_value = tf.math.reduce_mean(weight * tf.math.pow(y_pred - tf.math.log(y_true), 2.0))
return loss_value
I tried multiple configurations and optimizers but nothing seems to change the convergence rate.
from GradienTape convergence much slower than Keras.model.fit
No comments:
Post a Comment