I have overridden the keras.fit
function and added some tensorboard utility functions to it. Everything is almost fine except the RAM usage.
when I run my code in kaggle and keep watching the RAM usage, I can see that during training, apparently my data generator which is working with tf.data.Dataset
is accumulating the data (I guess) and does not remove the previous data in RAM.
I only include the last cell of notebook which is the training and validation process:
AUTOTUNE = tf.data.experimental.AUTOTUNE
pattern = train_path + '*/*.jp*g'
train_ds, val_ds = generator(pattern=pattern, validation_ratio=0.2)
train_size = tf.data.experimental.cardinality(train_ds).numpy()
val_size = tf.data.experimental.cardinality(train_ds).numpy()
batch_size = 32
n_epochs = 10
val_ds = configure_for_performance(val_ds, AUTOTUNE, batch_size)
# Tensorboard utils initialization.
model_name = 'xception'
path_to_run = generate_path(join(ROOT, 'runs'), last_run=True)
tb_train_path = join(path_to_run, 'logs','train')
tb_test_path = join(path_to_run, 'logs', 'test')
train_writer = tf.summary.create_file_writer(tb_train_path)
test_writer = tf.summary.create_file_writer(tb_test_path)
train_step = test_step = 0
# Configure model, loss function and accuracy metrics.
blocks_to_train = []
lr = 1e-4
optimizer = SGD(lr=lr, decay=1e-6,momentum=0.9,nesterov=True) # Define optimizer
# optimizer = Adam(learning_rate=0.0001)
model = create_model(optimizer, name=model_name, include_compile=False,blocks_to_train=blocks_to_train)
metrics = {'acc': 0.0, 'loss': 0.0, 'val_acc': 0.0, 'val_loss': 0.0, 'lr': lr}
train_loss = tf.keras.losses.categorical_crossentropy # define loss function
val_loss = tf.keras.losses.categorical_crossentropy # define loss function
train_acc = tf.keras.metrics.categorical_accuracy # define accuracy metric
val_acc = tf.keras.metrics.categorical_accuracy # define accuracy metric
train_loss_tracker = tf.keras.metrics.Mean()
val_loss_tracker = tf.keras.metrics.Mean()
train_acc_tracker = tf.keras.metrics.Mean()
val_acc_tracker = tf.keras.metrics.Mean()
for epoch in range(n_epochs):
# Iterate through the training set
confusion = np.zeros((len(labels), len(labels)))
train_loss_tracker.reset_states()
train_acc_tracker.reset_states()
print("\epoch {}/{}".format(epoch+1,n_epochs))
progress_bar = Progbar(train_size, stateful_metrics=list(metrics.keys()))
train_ds = train_ds.shuffle(train_size)
train_gen = train_ds.take(train_size)
train_gen = train_gen.map(_augment_function, num_parallel_calls=AUTOTUNE)
train_gen = configure_for_performance(train_gen, AUTOTUNE, batch_size)
for batch_idx, (x, y) in enumerate(train_gen):
with tf.GradientTape() as tape:
y_pred = model(x, training=True)
loss = train_loss(y, y_pred)
gradients = tape.gradient(loss, model.trainable_weights)
optimizer.apply_gradients(zip(gradients, model.trainable_weights))
train_loss_tracker.update_state(loss)
acc = train_acc(y, y_pred)
train_acc_tracker.update_state(acc)
train_step += 1
progress_bar.update(batch_idx*batch_size, values=[('acc',train_acc_tracker.result()),
('loss', train_loss_tracker.result())])
with train_writer.as_default():
tf.summary.scalar("Loss", train_loss_tracker.result(), step=epoch)
tf.summary.scalar(
"Accuracy", train_acc_tracker.result(), step=epoch
)
# reset accuracy between epochs (and for testing and test)
"""
########################################################
##################### Validation #####################
########################################################
"""
val_loss_tracker.reset_states()
val_acc_tracker.reset_states()
val_gen = val_ds.take(val_size)
val_gen = configure_for_performance(val_gen, AUTOTUNE, batch_size)
# train_gen = configure_for_performance(train_gen, AUTOTUNE, batch_size)
for batch_idx, (x,y) in enumerate(val_ds):
figure = image_grid(x.numpy(),y.numpy(), class_names=list(labels.keys()))
y_pred = model(x, training=False)
loss = val_loss(y, y_pred)
val_loss_tracker.update_state(loss)
acc = val_acc(y, y_pred)
val_acc_tracker.update_state(acc)
confusion += get_confusion_matrix(y, y_pred, class_names=list(labels.keys()))
with test_writer.as_default():
tf.summary.scalar("Loss", val_loss_tracker.result(), step=epoch)
tf.summary.scalar("Accuracy", val_acc_tracker.result(), step=epoch)
tf.summary.image('Confusion Matrix',
plot_confusion_matrix(confusion / batch_idx,
class_names=list(labels.keys())), step=epoch)
tf.summary.image("Visualize images",plot_to_image(figure), step=epoch)
progress_bar.update(train_size, values=[('val_acc', val_acc_tracker.result()), ('val_loss', val_loss_tracker.result())])
# reset accuracy between epochs (and for testing and test)
So what is the issue with the code? What should I possibly do to stabilize the RAM usage?
from keras cusom fit function crashes when there is no RAM left on the server
No comments:
Post a Comment