I have an application in which I need to set up a pipeline using tf.data. The data is stored in .mat files created in MATLAB; each file contains three variables: "s_matrix", a 224x224x3 double array, "frame", a 1024x1 complex double, and a numeric label. The pipeline should be built so that I can feed the data to the model.fit function. The code I have been using so far to load and process the data is added below, but I keep getting several type errors and unexpected-byte errors.
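For reference, this is roughly what a single sample looks like when I inspect it with scipy.io.loadmat (the file path here is just a placeholder; the dtypes are what loadmat reports for MATLAB doubles):

import scipy.io

sample = scipy.io.loadmat("path/to/one_sample.mat")  # placeholder path
print(sample["s_matrix"].shape, sample["s_matrix"].dtype)  # (224, 224, 3) float64
print(sample["frame"].shape, sample["frame"].dtype)        # (1024, 1) complex128
print(sample["numeric_label"])                             # e.g. array([[3]])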
import numpy as np
import scipy.io
import tensorflow as tf

# Define the shape of the input image
input_shape = (224, 224, 3)
# Define the shape of the complex vector after conversion
complex_shape = (1024, 2, 1)

def load_and_preprocess_sample(sample_path):
    # Load the sample from the .mat file
    sample = scipy.io.loadmat(sample_path)
    # Extract the matrix, complex vector and label from the sample
    matrix = sample['s_matrix']
    complex_vector = sample['frame']
    label = sample['numeric_label']
    # Convert the complex vector to the appropriate shape:
    # split into real and imaginary parts and stack them as two channels
    real = tf.reshape(tf.math.real(complex_vector), [1024, 1])
    imag = tf.reshape(tf.math.imag(complex_vector), [1024, 1])
    signal_tensor = tf.concat([real, imag], axis=-1)
    signal_tensor = tf.reshape(signal_tensor, [1024, 2, 1])
    signal = signal_tensor
    # Normalize the matrix values between 0 and 1
    matrix = matrix / 255.0
    # Convert the label to one-hot encoding
    label = tf.one_hot(label - 1, num_classes)
    return matrix, complex_vector, label
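As a side check, the complex-to-two-channel conversion itself behaves as I expect when run standalone in eager mode, although the result comes out as float64 (the dummy frame below just stands in for sample['frame']):

import numpy as np
import tensorflow as tf

# dummy complex frame, complex128 like the loadmat output
frame = np.random.randn(1024, 1) + 1j * np.random.randn(1024, 1)
real = tf.reshape(tf.math.real(frame), [1024, 1])
imag = tf.reshape(tf.math.imag(frame), [1024, 1])
signal = tf.reshape(tf.concat([real, imag], axis=-1), [1024, 2, 1])
print(signal.shape, signal.dtype)  # (1024, 2, 1), float64 -- not float32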
# Define a function to create a dataset from a list of file paths
def create_dataset(file_paths):
    # Create a dataset from the file paths
    dataset = tf.data.Dataset.from_tensor_slices(file_paths)
    # Shuffle the dataset
    dataset = dataset.shuffle(buffer_size=len(file_paths))
    # Load and preprocess each sample in parallel using CPU cores
    dataset = dataset.map(
        lambda x: tf.numpy_function(load_and_preprocess_sample, [x],
                                    [tf.float32, tf.float32, tf.float32]),
        num_parallel_calls=tf.data.AUTOTUNE)
    # Batch and prefetch the dataset for performance optimization
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(buffer_size=tf.data.AUTOTUNE)
    return dataset
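The datasets themselves are created like this (the glob pattern is just illustrative), and inspecting element_spec already shows that tf.numpy_function drops all static shape information:

import glob

train_file_paths = glob.glob("data/train/*.mat")  # illustrative pattern
train_dataset = create_dataset(train_file_paths)
print(train_dataset.element_spec)
# three float32 TensorSpecs, each with shape=<unknown>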
When I try to extract the data to pass to the model with the code below, everything crashes as soon as the zip function is called.
X_train_a, X_train_b, y_train = zip(*train_dataset)
X_val_a, X_val_b, y_val = zip(*val_dataset)
...
...
...
model.fit([X_train_a, X_train_b], y_train, batch_size=5, epochs=5, validation_data=([X_test_a, X_test_b], y_test))
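I also understand that model.fit can consume a tf.data.Dataset directly if each element is structured as (inputs, target), so an alternative to the zip extraction might be something like this (untested sketch; the batch size is already set inside the pipeline, so it is not passed to fit):

train_for_fit = train_dataset.map(lambda a, b, y: ((a, b), y))
val_for_fit = val_dataset.map(lambda a, b, y: ((a, b), y))
model.fit(train_for_fit, epochs=5, validation_data=val_for_fit)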
The error output is shown below:
---------------------------------------------------------------------------
InvalidArgumentError Traceback (most recent call last)
Cell In[6], line 84
78 val_dataset = create_dataset(val_file_paths)
82 #Y: attempting segragation of inputs
83 # Convert the dataset into separate inputs
---> 84 X_train_a, X_train_b, y_train = zip(*train_dataset)
85 X_val_a, X_val_b, y_val = zip(*val_dataset)
87 # Define your model architecture here...
File ~\miniconda3\envs\tf2\lib\site-packages\tensorflow\python\data\ops\iterator_ops.py:766, in OwnedIterator.__next__(self)
764 def __next__(self):
765 try:
--> 766 return self._next_internal()
767 except errors.OutOfRangeError:
768 raise StopIteration
File ~\miniconda3\envs\tf2\lib\site-packages\tensorflow\python\data\ops\iterator_ops.py:749, in OwnedIterator._next_internal(self)
746 # TODO(b/77291417): This runs in sync mode as iterators use an error status
747 # to communicate that there is no more data to iterate over.
748 with context.execution_mode(context.SYNC):
--> 749 ret = gen_dataset_ops.iterator_get_next(
750 self._iterator_resource,
751 output_types=self._flat_output_types,
752 output_shapes=self._flat_output_shapes)
754 try:
755 # Fast path for the case `self._structure` is not a nested structure.
756 return self._element_spec._from_compatible_tensor_list(ret) # pylint: disable=protected-access
File ~\miniconda3\envs\tf2\lib\site-packages\tensorflow\python\ops\gen_dataset_ops.py:3016, in iterator_get_next(iterator, output_types, output_shapes, name)
3014 return _result
3015 except _core._NotOkStatusException as e:
-> 3016 _ops.raise_from_not_ok_status(e, name)
3017 except _core._FallbackException:
3018 pass
File ~\miniconda3\envs\tf2\lib\site-packages\tensorflow\python\framework\ops.py:7209, in raise_from_not_ok_status(e, name)
7207 def raise_from_not_ok_status(e, name):
7208 e.message += (" name: " + name if name is not None else "")
-> 7209 raise core._status_to_exception(e) from None
InvalidArgumentError: 0-th value returned by pyfunc_0 is double, but expects float
[[]] [Op:IteratorGetNext]
I am definitely doing something wrong. What is the solution?
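From the last line of the traceback, my current guess is that loadmat hands back float64/complex128 arrays (MATLAB doubles), while the map declares tf.float32 outputs, and that I also return the raw complex_vector instead of the converted signal. A cast-everything version I am considering (sketch only, untested; num_classes is defined elsewhere in my script, and the path arrives as bytes through tf.numpy_function):

import numpy as np
import scipy.io

def load_and_preprocess_sample(sample_path):
    # tf.numpy_function passes string tensors in as bytes
    sample = scipy.io.loadmat(sample_path.decode("utf-8"))
    matrix = (sample["s_matrix"] / 255.0).astype(np.float32)
    frame = sample["frame"].reshape(1024)                 # complex128
    signal = np.stack([frame.real, frame.imag], axis=-1)  # (1024, 2), float64
    signal = signal.reshape(1024, 2, 1).astype(np.float32)
    label = np.zeros(num_classes, dtype=np.float32)
    label[int(np.squeeze(sample["numeric_label"])) - 1] = 1.0  # one-hot
    return matrix, signal, label

Would that resolve the InvalidArgumentError, or am I missing something else?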