Saturday 6 March 2021

Keras custom fit function crashes when there is no RAM left on the server

I have overridden the keras.fit function and added some TensorBoard utility functions to it. Everything works almost fine, except for the RAM usage.

When I run my code on Kaggle and watch the RAM usage, I can see that during training my data generator, which is built on tf.data.Dataset, appears to accumulate data and never releases the previous data from RAM.
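For context, this is roughly how I checked memory from inside the notebook (a minimal sketch using psutil; in practice I mostly watched the Kaggle resource monitor, so the helper below is only illustrative):

import os
import psutil

def log_ram_usage(tag):
    # Resident set size of the current notebook process, in MB.
    rss_mb = psutil.Process(os.getpid()).memory_info().rss / 1024 ** 2
    print("[{}] RSS: {:.1f} MB".format(tag, rss_mb))

Calling something like this once per epoch shows the usage climbing steadily instead of staying flat.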

I am only including the last cell of the notebook, which contains the training and validation loop:

AUTOTUNE = tf.data.experimental.AUTOTUNE

pattern = train_path + '*/*.jp*g'
train_ds, val_ds = generator(pattern=pattern, validation_ratio=0.2)

train_size = tf.data.experimental.cardinality(train_ds).numpy()
val_size = tf.data.experimental.cardinality(val_ds).numpy()

batch_size = 32
n_epochs = 10

val_ds = configure_for_performance(val_ds, AUTOTUNE, batch_size)

# Tensorboard utils initialization.
model_name = 'xception'
path_to_run = generate_path(join(ROOT, 'runs'), last_run=True)

tb_train_path = join(path_to_run, 'logs','train')
tb_test_path = join(path_to_run, 'logs', 'test')

train_writer = tf.summary.create_file_writer(tb_train_path)
test_writer = tf.summary.create_file_writer(tb_test_path)
train_step = test_step = 0

# Configure model, loss function and accuracy metrics.

blocks_to_train = []
lr = 1e-4
optimizer = SGD(lr=lr, decay=1e-6, momentum=0.9, nesterov=True)  # define optimizer
# optimizer = Adam(learning_rate=0.0001)

model = create_model(optimizer, name=model_name, include_compile=False, blocks_to_train=blocks_to_train)

metrics = {'acc': 0.0, 'loss': 0.0, 'val_acc': 0.0, 'val_loss': 0.0, 'lr': lr}

train_loss = tf.keras.losses.categorical_crossentropy   # define loss function
val_loss = tf.keras.losses.categorical_crossentropy # define loss function

train_acc = tf.keras.metrics.categorical_accuracy # define accuracy metric
val_acc = tf.keras.metrics.categorical_accuracy # define accuracy metric

train_loss_tracker = tf.keras.metrics.Mean()
val_loss_tracker = tf.keras.metrics.Mean()

train_acc_tracker = tf.keras.metrics.Mean()
val_acc_tracker = tf.keras.metrics.Mean()

for epoch in range(n_epochs):
    # Iterate through the training set
    confusion = np.zeros((len(labels), len(labels)))

    train_loss_tracker.reset_states()
    train_acc_tracker.reset_states()


    print("\epoch {}/{}".format(epoch+1,n_epochs))
    progress_bar = Progbar(train_size, stateful_metrics=list(metrics.keys()))
    train_ds = train_ds.shuffle(train_size)
    train_gen = train_ds.take(train_size)
    train_gen = train_gen.map(_augment_function, num_parallel_calls=AUTOTUNE)
    train_gen = configure_for_performance(train_gen, AUTOTUNE, batch_size)

    for batch_idx, (x, y) in enumerate(train_gen):
        with tf.GradientTape() as tape:
            y_pred = model(x, training=True)
            loss = train_loss(y, y_pred)

        gradients = tape.gradient(loss, model.trainable_weights)
        optimizer.apply_gradients(zip(gradients, model.trainable_weights))
        train_loss_tracker.update_state(loss)

        acc = train_acc(y, y_pred) 
        train_acc_tracker.update_state(acc)
        train_step += 1
        progress_bar.update(batch_idx * batch_size,
                            values=[('acc', train_acc_tracker.result()),
                                    ('loss', train_loss_tracker.result())])

    with train_writer.as_default():
        tf.summary.scalar("Loss", train_loss_tracker.result(), step=epoch)
        tf.summary.scalar(
            "Accuracy", train_acc_tracker.result(), step=epoch
        )

    # reset accuracy between epochs (and separately for validation below)


    """ 
    ########################################################
    #####################  Validation  #####################
    ########################################################
    """


    val_loss_tracker.reset_states()
    val_acc_tracker.reset_states()

    val_gen = val_ds.take(val_size)
    val_gen = configure_for_performance(val_gen, AUTOTUNE, batch_size)
    # train_gen = configure_for_performance(train_gen, AUTOTUNE, batch_size)

    for batch_idx, (x,y) in enumerate(val_ds):
        figure = image_grid(x.numpy(), y.numpy(), class_names=list(labels.keys()))
        y_pred = model(x, training=False)

        loss = val_loss(y, y_pred)
        val_loss_tracker.update_state(loss)

        acc = val_acc(y, y_pred) 
        val_acc_tracker.update_state(acc)
        confusion += get_confusion_matrix(y, y_pred, class_names=list(labels.keys()))

    with test_writer.as_default():
        tf.summary.scalar("Loss", val_loss_tracker.result(), step=epoch)
        tf.summary.scalar("Accuracy", val_acc_tracker.result(), step=epoch)
        tf.summary.image('Confusion Matrix',
                         plot_confusion_matrix(confusion / batch_idx,
                                               class_names=list(labels.keys())),
                         step=epoch)
        tf.summary.image("Visualize images", plot_to_image(figure), step=epoch)

    progress_bar.update(train_size, values=[('val_acc', val_acc_tracker.result()), ('val_loss', val_loss_tracker.result())])

    # reset accuracy between epochs (and for validation)
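For reference, generator and configure_for_performance are small helpers defined in an earlier cell. configure_for_performance roughly follows the standard tf.data performance recipe from the TensorFlow tutorials; the sketch below is only an approximation of my helper, not the exact code:

def configure_for_performance(ds, autotune, batch_size):
    # Cache the decoded images in memory, batch them, and prefetch the next batch.
    ds = ds.cache()
    ds = ds.batch(batch_size)
    ds = ds.prefetch(buffer_size=autotune)
    return ds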

So what is the issue with the code? What can I do to keep the RAM usage stable?


