Wednesday 9 December 2020

How to use tensorflow.data to generate sequences of frames for C3D networks?

My task is to detect videos that contain violence. My dataset contains 2,000 videos split into two categories, violent and nonviolent. This is the code that creates the model and loads the pretrained weights:

# Main repo: https://github.com/eazydammy/violence-detection-with-C3D

import skvideo.io
import tensorflow.keras.backend as K
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Flatten
from tensorflow.keras.layers import Conv3D, MaxPooling3D, ZeroPadding3D
from sports1M_utils import preprocess_input, decode_predictions
import cv2
import matplotlib.pyplot as plt

WEIGHTS_PATH = 'https://github.com/adamcasson/c3d/releases/download/v0.1/sports1M_weights_tf.h5'  # download to weights/sports1M_weights_tf.h5


def C3D(weights='sports1M'):
    """
    Keyword arguments:
    weights -- path to the pretrained Sports1M weights file (HDF5).

    Returns:
    A Keras model with a frozen C3D backbone and a trainable binary head.
    """

    if K.image_data_format() == 'channels_last':
        shape = (16, 112, 112, 3)
    else:
        shape = (3, 16, 112, 112)

    model = Sequential()
    model.add(Conv3D(64, 3, activation='relu', padding='same', name='conv1', input_shape=shape))
    model.add(MaxPooling3D(pool_size=(1, 2, 2), strides=(1, 2, 2), padding='same', name='pool1'))

    model.add(Conv3D(128, 3, activation='relu', padding='same', name='conv2'))
    model.add(MaxPooling3D(pool_size=(2, 2, 2), strides=(2, 2, 2), padding='valid', name='pool2'))

    model.add(Conv3D(256, 3, activation='relu', padding='same', name='conv3a'))
    model.add(Conv3D(256, 3, activation='relu', padding='same', name='conv3b'))
    model.add(MaxPooling3D(pool_size=(2, 2, 2), strides=(2, 2, 2), padding='valid', name='pool3'))

    model.add(Conv3D(512, 3, activation='relu', padding='same', name='conv4a'))
    model.add(Conv3D(512, 3, activation='relu', padding='same', name='conv4b'))
    model.add(MaxPooling3D(pool_size=(2, 2, 2), strides=(2, 2, 2), padding='valid', name='pool4'))

    model.add(Conv3D(512, 3, activation='relu', padding='same', name='conv5a'))
    model.add(Conv3D(512, 3, activation='relu', padding='same', name='conv5b'))
    model.add(ZeroPadding3D(padding=(0, 1, 1)))
    model.add(MaxPooling3D(pool_size=(2, 2, 2), strides=(2, 2, 2), padding='valid', name='pool5'))

    model.add(Flatten())

    model.add(Dense(4096, activation='relu', name='fc6'))
    model.add(Dropout(0.5))
    model.add(Dense(4096, activation='relu', name='fc7'))
    model.add(Dropout(0.5))
    model.add(Dense(487, activation='softmax', name='fc8'))  # original Sports1M head; kept only so the pretrained weights load cleanly
    model.load_weights(weights)

    fc7 = model.get_layer('fc7').output
    classifier = Dense(512, activation='relu', name='classifier_512')(fc7)
    classifier = Dropout(0.3)(classifier)
    classifier = Dense(1, activation='sigmoid', name='binary_classifier')(classifier)
    model = Model(inputs=model.input, outputs=classifier)

    # Freeze everything except the newly added classifier head.
    for layer in model.layers[:-3]:
        layer.trainable = False

    return model


if __name__ == '__main__':
    model = C3D(weights='weights/sports1M_weights_tf.h5')
    print(model.summary())


    vid_path = 'video/1.mp4'
    vid = skvideo.io.vread(vid_path)
    vid = vid[40:56]  # take a 16-frame clip
    vid = preprocess_input(vid)
    preds = model.predict(vid)
    print(preds)  # a single sigmoid score; decode_predictions only applies to the original 487-class Sports1M head
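
Since the new head is a single sigmoid unit, the model gets compiled for binary classification. A minimal sketch; the optimizer and learning rate are my own choices, not taken from the reference repo:

# Assumed compile step for the binary fight/nonfight head. The optimizer
# and learning rate are my own choices, not from the reference repo.
from tensorflow.keras.optimizers import Adam

model.compile(
    optimizer=Adam(learning_rate=1e-4),
    loss='binary_crossentropy',  # matches the Dense(1, activation='sigmoid') output
    metrics=['accuracy'],
)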

This is the structure of my dataset directories (training directory):

/ fight
    / folder 1
        frame_1.png
        frame_2.png
        ....
        frame_16.png
    / folder 2
    ....
/ nonfight
    / folder 1
        frame_1.png
        frame_2.png
        ....
        frame_16.png
    / folder 2
    ....

The validation dataset is structured the same way.

Now, this is the code I use as a data generator for training. The model expects input of shape (16, 16, 112, 112, 3), which stands for (batch_size, frames_per_sequence, height, width, channels). I followed this guide and adapted the same idea for loading and generating batches of 3D-shaped arrays:

# Main repo: https://github.com/eazydammy/violence-detection-with-C3D
import cv2
from glob import glob
import random as rnd
from tensorflow import keras
from tensorflow.keras.utils import Sequence
import os
from pathlib import Path
import numpy as np
from tqdm import tqdm
from natsort import natsorted


class Meta:
    npy_file = '../weights/c3d_mean.npy' # Address: https://github.com/adamcasson/c3d/releases/download/v0.1/c3d_mean.npy


class DataGeneratorV1(Sequence):
    def __init__(self, path="train", batch_size=16, shuffle=False, dim=(16, 112, 112),
                 channels=3):
        assert os.path.isdir(path), "directory does not exist!"
        self.main_path = Path(path) # parent directory to fight/nonfight directories
        self.dim = dim
        self.channels = channels
        self.batch_size = batch_size
        self.shuffle = shuffle
        # Labels derive from the directory names: fight -> 1, nonfight -> 0
        self.labels = {}  # sequence id -> label
        self.data = {}  # sequence id -> list of 16 frame paths
        self.n_classes = 2

        # Resize based on repo inference code
        self.width = 171
        self.height = 128

        fight_path = self.main_path / 'fight'
        non_fight_path = self.main_path / 'nonfight'

        # Path.glob('**') yields the directory itself first, then every
        # sub-directory; each sub-directory holds one 16-frame sequence.
        train_fights = list(fight_path.glob('**'))
        train_non_fights = list(non_fight_path.glob('**'))

        train_fights = [(fight, 1) for fight in train_fights[1:]]  # label 1 = fight; [1:] drops the parent dir itself
        train_non_fights = [(non_fight, 0) for non_fight in train_non_fights[1:]]  # label 0 = nonfight

        data = train_fights + train_non_fights  # Gathering all dataset as one

        for id_, (sub_dir, label) in enumerate(tqdm(data, desc="Initializing V/NV")):
            images = glob(f"{sub_dir.as_posix()}/*.png")  # extension must match the saved frames (.png per the layout above)
            images = natsorted(images)
            self.data[id_] = images 
            self.labels[id_] = label

        self.on_epoch_end()

    def on_epoch_end(self):
        self.keys = list(self.data.keys())
        if self.shuffle:
            rnd.shuffle(self.keys)

    def __data_generation(self, list_IDs_temp):
        X = np.empty((self.batch_size, *self.dim, self.channels))  # (batch_size, 16, 112, 112, 3)
        Y = np.empty((self.batch_size, 1))  # one binary label per sequence
        mean = np.load(str(Meta.npy_file))

        # Build the batch of clips; each iteration adds one (16, 112, 112, 3) sequence.
        for i, item in enumerate(list_IDs_temp):
            pack_16_frames = self.data[item]
            frames = self.__read_images(pack_16_frames)
            frames -= mean  # preprocess: subtract the Sports1M mean
            frames = frames[:, 8:120, 30:142, :]  # crop to (16, 112, 112, 3)
            X[i] = frames
            Y[i] = self.labels[item]

        # The model's head is a single sigmoid unit, so it expects binary
        # labels of shape (batch_size, 1), not one-hot vectors.
        return X, Y

    def __read_images(self, pack_16_frames):
        frames = np.zeros((16, 128, 171, 3), dtype='float')

        for i, id_ in enumerate(pack_16_frames):
            img = cv2.imread(id_)
            img = cv2.resize(img, (171, 128), interpolation=cv2.INTER_CUBIC)
            frames[i, :, :, :] = img

        return frames

    def __len__(self):
        return int(np.floor(len(self.data) / self.batch_size))

    def __getitem__(self, index):

        current = index * self.batch_size
        after = (index + 1) * self.batch_size

        keys = self.keys[current:after]
        batch_x, batch_y = self.__data_generation(keys)

        return batch_x, batch_y


if __name__ == '__main__':
    path = '../data/train'
    batch_size = 16
    shuffle = True
    data_gen = DataGeneratorV1(path=path, batch_size=batch_size, shuffle=shuffle)
    a, b = data_gen[5]
    print(b)

About the code above: I subclassed tensorflow.keras.utils.Sequence. In __init__ it builds a dictionary that maps each key to a sequence of 16 frame paths. On every request for a batch (__getitem__), it reads the images from disk (__data_generation and __read_images), preprocesses each sequence and crops it to (16, 112, 112, 3) (__data_generation), and the batch is then ready (__data_generation).

The problem is that the training and validation processes starve while waiting for data to be prepared, which makes both of them extremely slow.
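
For what it's worth, I know model.fit can already consume a Sequence with background workers; below is a sketch of the fit call I believe enables that (worker counts, paths and epochs are placeholders, and the model is compiled as in the earlier sketch):

# Sketch: consuming the Sequence with Keras' built-in background workers.
# The workers fill a queue of up to max_queue_size ready batches while the
# GPU trains on the current one; paths and epochs are placeholders.
train_gen = DataGeneratorV1(path='../data/train', batch_size=16, shuffle=True)
val_gen = DataGeneratorV1(path='../data/val', batch_size=16)

model.fit(
    train_gen,
    validation_data=val_gen,
    epochs=10,
    workers=4,                # parallel batch-loading workers
    use_multiprocessing=True,
    max_queue_size=8,         # stop loading once 8 batches are queued
)

Even with that, I would prefer a proper tf.data pipeline.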

Now I intend to rewrite the same functionality with tf.data, which should do the same job more efficiently, but I am not experienced with it.

I need a data loader that overlaps training and I/O using parallel processing: while the model trains on one batch, another thread loads the next batches into a queue, and stops once m batches are ready in the queue.
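
Here is my untested first attempt at a tf.data version, to make the goal concrete. The helper names (list_sequences and load_sequence) are my own, the frames are assumed to be .png as in the layout above, and I am assuming c3d_mean.npy broadcasts against (16, 128, 171, 3) exactly as it does in the Sequence code:

import numpy as np
import tensorflow as tf
from pathlib import Path
from natsort import natsorted

mean = tf.constant(np.load('../weights/c3d_mean.npy'), dtype=tf.float32)


def list_sequences(root):
    """Collect (16 frame paths, label) pairs, mirroring DataGeneratorV1."""
    pairs = []
    for sub_dir, label in (('fight', 1), ('nonfight', 0)):
        for seq_dir in sorted(p for p in (Path(root) / sub_dir).iterdir() if p.is_dir()):
            frames = natsorted(str(f) for f in seq_dir.glob('*.png'))
            if len(frames) == 16:
                pairs.append((frames, label))
    return pairs


def load_sequence(frame_paths, label):
    """Decode, resize, mean-subtract and crop 16 frames inside the TF graph."""
    def read_frame(path):
        img = tf.io.decode_png(tf.io.read_file(path), channels=3)
        img = tf.image.resize(img, (128, 171))  # same resize as the Sequence version
        return img[..., ::-1]  # RGB -> BGR, to match cv2.imread in the Sequence version

    frames = tf.map_fn(read_frame, frame_paths, fn_output_signature=tf.float32)
    frames = frames - mean  # subtract the Sports1M mean
    frames = frames[:, 8:120, 30:142, :]  # crop to (16, 112, 112, 3)
    return frames, tf.cast(label, tf.float32)


pairs = list_sequences('../data/train')
paths = tf.constant([p for p, _ in pairs])  # shape (num_sequences, 16)
labels = tf.constant([l for _, l in pairs])

dataset = (
    tf.data.Dataset.from_tensor_slices((paths, labels))
    .shuffle(len(pairs))
    .map(load_sequence, num_parallel_calls=tf.data.experimental.AUTOTUNE)  # parallel decode
    .batch(16, drop_remainder=True)
    .prefetch(tf.data.experimental.AUTOTUNE)  # keep batches queued ahead of training
)

If this is roughly right, model.fit(dataset, epochs=...) should keep the GPU fed: num_parallel_calls overlaps the decoding work, and prefetch plays the role of the m-batch queue described above.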

Would you help me with your ideas?


