Tuesday, 18 May 2021

How to monitor accuracy with CTC loss function and Datasets? (runnable code included)

I've been trying to speed up training of my CRNN network for optical character recognition, but I can't get the accuracy metric working when I switch to TFRecords and a tf.data.Dataset pipeline. It worked previously when I fed the data with a Keras Sequence. Here is a complete, runnable toy example that reproduces the problem (tested with TensorFlow 2.4.1):

import random
import numpy as np
import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras import Input, Model
from tensorflow.keras.layers import Dense, Layer, Bidirectional, GRU, Reshape, Activation
from tensorflow.keras.optimizers import Adam

AUTOTUNE = tf.data.experimental.AUTOTUNE
CHAR_VECTOR = "ABC"
IMG_W = 10
IMG_H = 10
N_CHANNELS = 3


class CTCLayer(Layer):
    def __init__(self, name=None):
        super().__init__(name=name)
        self.loss_fn = K.ctc_batch_cost

    def call(self, y_true, y_pred, label_length):
        # Compute the training-time loss value and add it
        # to the layer using `self.add_loss()`.
        batch_len = tf.cast(tf.shape(y_true)[0], dtype="int64")
        input_length = tf.cast(tf.shape(y_pred)[1], dtype="int64")
        input_length = input_length * tf.ones(shape=(batch_len, 1), dtype="int64")

        loss = self.loss_fn(y_true, y_pred, input_length, label_length)
        self.add_loss(loss)

        # At test time, just return the computed predictions
        return y_pred


def get_model():
    n_classes = len(CHAR_VECTOR) + 1

    input = Input(name='image', shape=(IMG_W, IMG_H, N_CHANNELS), dtype='float32')
    label = Input(name='label', shape=[None], dtype='float32')
    label_length = Input(name='label_length', shape=[None], dtype='int64')

    x = Reshape(target_shape=(IMG_W, np.prod(input.shape[2:])), name='reshape')(input)
    x = Dense(24, activation='relu', name='dense1')(x)
    x = Bidirectional(GRU(24, return_sequences=True, name="GRU"), merge_mode="sum")(x)
    x = Dense(n_classes, name='dense2')(x)
    y_pred = Activation('softmax', name='softmax')(x)

    output = CTCLayer(name="ctc")(label, y_pred, label_length)

    m = Model(inputs=[input, label, label_length], outputs=output)
    return m


def image_feature(value):
    """Returns a bytes_list Feature containing a JPEG-encoded image."""
    # encode_jpeg expects uint8, so scale the float image to [0, 255] first.
    value = tf.image.convert_image_dtype(value, tf.uint8)
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[tf.io.encode_jpeg(value).numpy()]))


def float_feature_list(value):
    """Returns a list of float_list from a float / double."""
    return tf.train.Feature(float_list=tf.train.FloatList(value=value))


def int64_feature(value):
    """Returns an int64_list from a bool / enum / int / uint."""
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))


def create_example(image, label, label_length):
    feature = {
        "image": image_feature(image),
        "label": float_feature_list(label),
        "label_length": int64_feature(label_length),
    }
    return tf.train.Example(features=tf.train.Features(feature=feature))


def parse_tfrecord_fn(example):
    feature_description = {
        "image": tf.io.FixedLenFeature([], tf.string),
        "label": tf.io.VarLenFeature(tf.float32),
        "label_length": tf.io.FixedLenFeature([1], tf.int64),
    }
    example = tf.io.parse_single_example(example, feature_description)
    example["image"] = tf.image.convert_image_dtype(tf.io.decode_jpeg(example["image"], channels=3), dtype="float32")
    example["label"] = tf.sparse.to_dense(example["label"])

    return example


def generate_tfrecords(n):
    with tf.io.TFRecordWriter(filename) as writer:
        for i in range(n):
            random_img = np.random.random((IMG_W, IMG_H, N_CHANNELS))
            label_length = random.randint(1, max_text_len)
            label = np.random.randint(0, len(CHAR_VECTOR), max_text_len)
            example = create_example(random_img, label, label_length)
            writer.write(example.SerializeToString())


class DataGenerator(tf.keras.utils.Sequence):
    def __len__(self):
        return steps_per_epoch

    def __getitem__(self, index):
        outputs = np.zeros([batch_size])
        dataset = get_dataset()
        inputs = next(iter(dataset.take(1)))
        return inputs, outputs


def get_dataset():
    generate_tfrecords(batch_size * epochs * steps_per_epoch)
    dataset = (
        tf.data.TFRecordDataset(filename, num_parallel_reads=AUTOTUNE)
        .map(parse_tfrecord_fn, num_parallel_calls=AUTOTUNE)
        .batch(batch_size)
        .prefetch(AUTOTUNE)
    )
    return dataset


if __name__ == "__main__":
    batch_size = 9
    epochs = 7
    steps_per_epoch = 8
    max_text_len = 5
    filename = "test.tfrec"
    use_generator = False
    data = DataGenerator() if use_generator else get_dataset()

    model = get_model()
    # This fails when use_generator == False; removing the metric solves it.
    model.compile(optimizer=Adam(), metrics=["accuracy"])
    model.fit(data, epochs=epochs, steps_per_epoch=steps_per_epoch)

Set use_generator = True or remove metrics=["accuracy"] and it will run without error.
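
To spell out the combinations I'm seeing with the script above:

# Works: Sequence plus the accuracy metric
data = DataGenerator()
model.compile(optimizer=Adam(), metrics=["accuracy"])

# Works: Dataset without the metric
data = get_dataset()
model.compile(optimizer=Adam())

# Fails: Dataset plus the accuracy metric
data = get_dataset()
model.compile(optimizer=Adam(), metrics=["accuracy"])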

As you can see, the DataGenerator uses the same data from the TFRecords, but it also returns an array of zeros as a dummy target for each batch, and for whatever reason that seems to be the magic sauce:

class DataGenerator(tf.keras.utils.Sequence):
    def __len__(self):
        return steps_per_epoch

    def __getitem__(self, index):
        outputs = np.zeros([batch_size])
        dataset = get_dataset()
        inputs = next(iter(dataset.take(1)))
        return inputs, outputs

I also noticed that this Keras example suffers from the same problem (it crashes if you edit the code to monitor accuracy): https://keras.io/examples/vision/captcha_ocr/
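
For clarity, the edit I mean there is just adding the metric to the compile call, roughly like this (the exact optimizer variable in the tutorial may be named differently):

# In the captcha_ocr example, compiling with an accuracy metric triggers the same crash:
model.compile(optimizer=keras.optimizers.Adam(), metrics=["accuracy"])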

Is there any way to mimic the behaviour of __getitem__ with the Dataset, or some other way of getting the accuracy without using a Sequence?
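
The closest I've come up with is mapping the batched dataset so it yields (inputs, dummy_target) tuples like the Sequence does. This is just a sketch (the helper below is not part of the script above, and I haven't verified it actually makes the metric work):

def get_dataset_with_dummy_targets():
    # Pair every batch of inputs with a per-example dummy target of zeros,
    # mimicking the (inputs, outputs) tuples that DataGenerator.__getitem__ returns.
    return get_dataset().map(
        lambda inputs: (inputs, tf.zeros(tf.shape(inputs["label"])[:1])),
        num_parallel_calls=AUTOTUNE,
    )

The idea is only to reproduce the structure the Sequence yields, without touching the model itself.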


