Wednesday, 21 June 2023

Issue in setting up a tf.data pipeline for training TensorFlow model

I have an application in which I need to set up an input pipeline using tf.data. The data is stored in .mat files created in MATLAB; each file contains three variables: "s_matrix", a 224x224x3 double array; "frame", a 1024x1 complex double vector; and a numeric label. The pipeline needs to produce batches I can feed to model.fit, but with the code I have been using so far I keep getting type-mismatch and unexpected-byte errors.
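
For reference, this is roughly what one file looks like when loaded with scipy.io.loadmat (the file name is just a placeholder; note that loadmat returns double-precision NumPy arrays and wraps scalars as 2-D arrays):

import scipy.io

sample = scipy.io.loadmat('sample_0001.mat')                # placeholder path
print(sample['s_matrix'].shape, sample['s_matrix'].dtype)   # (224, 224, 3) float64
print(sample['frame'].shape, sample['frame'].dtype)         # (1024, 1) complex128
print(sample['numeric_label'])                              # e.g. [[3]]

The pipeline code I have been using to load and process the data is below.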

import numpy as np
import scipy.io
import tensorflow as tf

# Define the shape of the input image
input_shape = (224, 224, 3)

# Define the shape of the complex vector after conversion
complex_shape = (1024, 2, 1)

# Placeholders; the real values are set elsewhere in my notebook
num_classes = 10
batch_size = 5


def load_and_preprocess_sample(sample_path):

    # Load the sample from the mat file
    sample = scipy.io.loadmat(sample_path)
    # Extract the matrix, complex vector and label from the sample
    matrix = sample['s_matrix']
    complex_vector = sample['frame']
    label = sample['numeric_label']

    # Convert the complex frame to the (1024, 2, 1) shape the model expects:
    # split it into real and imaginary parts and stack them as channels
    real = tf.reshape(tf.math.real(complex_vector), [1024, 1])
    imag = tf.reshape(tf.math.imag(complex_vector), [1024, 1])
    signal_tensor = tf.concat([real, imag], axis=-1)
    signal_tensor = tf.reshape(signal_tensor, [1024, 2, 1])
    signal = signal_tensor
    # Normalize the matrix values between 0 and 1
    matrix = matrix / 255.0

    # Convert the label to one-hot encoding
    label = tf.one_hot(label - 1, num_classes)

    return matrix, complex_vector, label

# Define a function to create a dataset from a list of file paths
def create_dataset(file_paths):
    # Create a dataset from the file paths
    dataset = tf.data.Dataset.from_tensor_slices(file_paths)

    # Shuffle the dataset
    dataset = dataset.shuffle(buffer_size=len(file_paths))

    # Load and preprocess each sample in parallel using CPU cores
    dataset = dataset.map(
        lambda x: tf.numpy_function(load_and_preprocess_sample, [x], [tf.float32, tf.float32, tf.float32]),
        num_parallel_calls=tf.data.AUTOTUNE)

    # Batch and prefetch the dataset for performance optimization
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(buffer_size=tf.data.AUTOTUNE)

    return dataset
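
My understanding is that tf.numpy_function requires the wrapped function to return NumPy arrays whose dtypes exactly match the declared output types, so a float32 version of the loader should look roughly like the sketch below. The explicit casts, the path decoding, and returning the stacked real/imaginary signal instead of the raw complex vector are all my assumptions about the intended behavior:

def load_and_preprocess_sample_v2(sample_path):
    # numpy_function hands the path in as a bytes object
    if isinstance(sample_path, bytes):
        sample_path = sample_path.decode('utf-8')
    sample = scipy.io.loadmat(sample_path)

    # Image: cast from float64 to float32 and normalize to [0, 1]
    matrix = sample['s_matrix'].astype(np.float32) / 255.0

    # Complex frame: split into real/imaginary channels, shape (1024, 2, 1)
    frame = sample['frame']                            # (1024, 1) complex128
    real = np.real(frame).astype(np.float32)
    imag = np.imag(frame).astype(np.float32)
    signal = np.concatenate([real, imag], axis=-1).reshape(1024, 2, 1)

    # Label: one-hot encode as float32 (labels assumed to start at 1)
    label = int(np.squeeze(sample['numeric_label']))
    one_hot = np.eye(num_classes, dtype=np.float32)[label - 1]

    return matrix, signal, one_hot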

When I try to extract the data to pass to the model with the code below, everything crashes as soon as zip is called.

X_train_a, X_train_b, y_train = zip(*train_dataset)
X_val_a, X_val_b, y_val = zip(*val_dataset)
...
...
...
model.fit([X_train_a, X_train_b], y_train, batch_size=5, epochs=5, validation_data=([X_test_a, X_test_b], y_test))
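
As an alternative I have been looking at, a tf.data.Dataset can apparently be passed to model.fit directly, without materializing it through zip, as long as each element is structured as an (inputs, label) pair. A sketch of what I mean, built on the hypothetical v2 loader above (train_file_paths and val_file_paths are the same lists I pass to create_dataset; the set_shape calls restore the shape information that numpy_function discards):

def to_model_inputs(path):
    matrix, signal, label = tf.numpy_function(
        load_and_preprocess_sample_v2, [path],
        [tf.float32, tf.float32, tf.float32])
    # numpy_function loses static shapes, so they must be re-attached
    matrix.set_shape(input_shape)
    signal.set_shape(complex_shape)
    label.set_shape([num_classes])
    return (matrix, signal), label

def make_dataset(file_paths):
    return (tf.data.Dataset.from_tensor_slices(file_paths)
            .shuffle(buffer_size=len(file_paths))
            .map(to_model_inputs, num_parallel_calls=tf.data.AUTOTUNE)
            .batch(batch_size)
            .prefetch(tf.data.AUTOTUNE))

train_ds = make_dataset(train_file_paths)
val_ds = make_dataset(val_file_paths)
model.fit(train_ds, epochs=5, validation_data=val_ds)

With this structure the batch size comes from the dataset itself, so model.fit does not need a batch_size argument.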

The error output from the original zip-based code is shared below:

---------------------------------------------------------------------------
InvalidArgumentError                      Traceback (most recent call last)
Cell In[6], line 84
     78 val_dataset = create_dataset(val_file_paths)
     82 #Y: attempting segragation of inputs
     83 # Convert the dataset into separate inputs
---> 84 X_train_a, X_train_b, y_train = zip(*train_dataset)
     85 X_val_a, X_val_b, y_val = zip(*val_dataset)
     87 # Define your model architecture here...

File ~\miniconda3\envs\tf2\lib\site-packages\tensorflow\python\data\ops\iterator_ops.py:766, in OwnedIterator.__next__(self)
    764 def __next__(self):
    765   try:
--> 766     return self._next_internal()
    767   except errors.OutOfRangeError:
    768     raise StopIteration

File ~\miniconda3\envs\tf2\lib\site-packages\tensorflow\python\data\ops\iterator_ops.py:749, in OwnedIterator._next_internal(self)
    746 # TODO(b/77291417): This runs in sync mode as iterators use an error status
    747 # to communicate that there is no more data to iterate over.
    748 with context.execution_mode(context.SYNC):
--> 749   ret = gen_dataset_ops.iterator_get_next(
    750       self._iterator_resource,
    751       output_types=self._flat_output_types,
    752       output_shapes=self._flat_output_shapes)
    754   try:
    755     # Fast path for the case `self._structure` is not a nested structure.
    756     return self._element_spec._from_compatible_tensor_list(ret)  # pylint: disable=protected-access

File ~\miniconda3\envs\tf2\lib\site-packages\tensorflow\python\ops\gen_dataset_ops.py:3016, in iterator_get_next(iterator, output_types, output_shapes, name)
   3014   return _result
   3015 except _core._NotOkStatusException as e:
-> 3016   _ops.raise_from_not_ok_status(e, name)
   3017 except _core._FallbackException:
   3018   pass

File ~\miniconda3\envs\tf2\lib\site-packages\tensorflow\python\framework\ops.py:7209, in raise_from_not_ok_status(e, name)
   7207 def raise_from_not_ok_status(e, name):
   7208   e.message += (" name: " + name if name is not None else "")
-> 7209   raise core._status_to_exception(e) from None

InvalidArgumentError:  0-th value returned by pyfunc_0 is double, but expects float
     [[]] [Op:IteratorGetNext]
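
If I read the message correctly, the first value returned by my numpy_function is float64 while the map call declared tf.float32. Calling the loader directly on one file (train_file_paths is assumed to be the list I pass to create_dataset) should make the mismatch visible:

m, c, l = load_and_preprocess_sample(train_file_paths[0])
print(np.asarray(m).dtype)   # float64: loadmat returns double precision
print(np.asarray(c).dtype)   # complex128: the raw complex vector is returned
print(np.asarray(l).dtype)   # float32: output of tf.one_hot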

I am definitely doing something wrong. What is the solution?


