Tuesday 10 November 2020

Why would using the same dataset for training and testing give different accuracies?

I've been looking at the loss on the training and the validation datasets, and I keep seeing the validation loss come out smaller than the training loss, even when the two are the same dataset. I'm trying to get some insight into why this is the case.

I am training a model in TensorFlow to predict some time-series data. The model creation and preprocessing are as follows:

import datetime
import os

import tensorflow as tf
from tensorflow import keras as k   # `k` is used below as a shorthand for tf.keras

window_size = 40
batch_size  = 32
forecast_period = 6
model_name = "LSTM"
tf.keras.backend.clear_session()

_seed = 42
tf.random.set_seed(_seed)

def _sub_to_batch(sub):
    return sub.batch(window_size, drop_remainder=True)

def _return_input_output(tensor):
    _input  = tensor[:, :-forecast_period, :]
    _output = tensor[:, forecast_period:, :]
    return _input, _output

def _reshape_tensor(tensor):
    tensor = tf.expand_dims(tensor, axis=-1)
    tensor = tf.transpose(tensor, [1, 0, 2])
    return tensor


# total elements after unbatch(): 3813
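# train_ts / valid_ts are the raw time-series arrays (defined elsewhere, not shown in this snippet)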
train_ts_dataset = tf.data.Dataset.from_tensor_slices(train_ts)\
                            .window(window_size, shift=1)\
                            .flat_map(_sub_to_batch)\
                            .map(_reshape_tensor)\
                            .map(_return_input_output)
#                             .unbatch().shuffle(buffer_size=500, seed=_seed).batch(batch_size)\
#                             .map(_return_input_output)

valid_ts_dataset = tf.data.Dataset.from_tensor_slices(valid_ts)\
                            .window(window_size, shift=1)\
                            .flat_map(_sub_to_batch)\
                            .map(_reshape_tensor)\
                            .unbatch().shuffle(buffer_size=500, seed=_seed).batch(batch_size)\
                            .map(_return_input_output)

def _forecast_mae(y_pred, y_true):
    _y_pred = y_pred[:, -forecast_period:, :]
    _y_true = y_true[:, -forecast_period:, :]
    mae = tf.losses.MAE(_y_true, _y_pred)
    return mae

def _accuracy(y_pred, y_true):
    # print(y_true) => Tensor("sequential/time_distributed/Reshape_1:0", shape=(None, 34, 1), dtype=float32)
    # y_true[-forecast_period:, :]  =>   Tensor("strided_slice_4:0", shape=(None, 34, 1), dtype=float32)
    # y_true[:, -forecast_period:, :] => Tensor("strided_slice_4:0", shape=(None, 6, 1), dtype=float32)

    _y_pred = y_pred[:, -forecast_period:, :]
    _y_pred = tf.reshape(_y_pred, shape=[-1, forecast_period])
    _y_true = y_true[:, -forecast_period:, :]
    _y_true = tf.reshape(_y_true, shape=[-1, forecast_period])

    # MAPE: Tensor("Mean_1:0", shape=(None, 1), dtype=float32)
    MAPE = tf.math.reduce_mean(tf.math.abs((_y_pred - _y_true) / _y_true), axis=1, keepdims=True)

    accuracy = 1 - MAPE
    accuracy = tf.where(accuracy < 0, tf.zeros_like(accuracy), accuracy)
    accuracy = tf.reduce_mean(accuracy)
    return accuracy

model = k.models.Sequential([
    k.layers.Bidirectional(k.layers.LSTM(units=100, return_sequences=True), input_shape=(None, 1)),
    k.layers.Bidirectional(k.layers.LSTM(units=100, return_sequences=True)),
    k.layers.TimeDistributed(k.layers.Dense(1))
])

model_name = []
model_name_symbols = {"bidirectional": "BILSTM_1", "bidirectional_1": "BILSTM_2", "time_distributed": "td"}
for l in model.layers:
    model_name.append(model_name_symbols.get(l.name, l.name))

model_name = "_".join(model_name)
print(model_name)

for i, (x, y) in enumerate(train_ts_dataset):
    print(i, x.numpy().shape, y.numpy().shape)

The printed model name and the element shapes of the training dataset are as follows:

BILSTM_1_BILSTM_2_td
0 (123, 34, 1) (123, 34, 1)
1 (123, 34, 1) (123, 34, 1)
2 (123, 34, 1) (123, 34, 1)
3 (123, 34, 1) (123, 34, 1)
4 (123, 34, 1) (123, 34, 1)
5 (123, 34, 1) (123, 34, 1)
6 (123, 34, 1) (123, 34, 1)
7 (123, 34, 1) (123, 34, 1)
8 (123, 34, 1) (123, 34, 1)
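For reference, the 34 comes from window_size - forecast_period = 40 - 6, and the 123 is the second dimension of train_ts, which the transpose moves to the front of each element. A minimal standalone sketch of the same windowing steps on dummy data (the 50-step length and the 123 columns below are made-up placeholders, just to reproduce the shapes):

# Standalone sketch with dummy data to reproduce the shapes above.
import numpy as np
import tensorflow as tf

window_size, forecast_period = 40, 6
dummy_ts = np.random.rand(50, 123).astype("float32")                     # (time steps, columns)

ds = tf.data.Dataset.from_tensor_slices(dummy_ts)                        # elements of shape (123,)
ds = ds.window(window_size, shift=1)                                     # nested window datasets
ds = ds.flat_map(lambda w: w.batch(window_size, drop_remainder=True))    # (40, 123)
ds = ds.map(lambda t: tf.transpose(tf.expand_dims(t, -1), [1, 0, 2]))    # (123, 40, 1)
ds = ds.map(lambda t: (t[:, :-forecast_period, :], t[:, forecast_period:, :]))

for x, y in ds.take(1):
    print(x.shape, y.shape)   # (123, 34, 1) (123, 34, 1)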

The model is then compiled and trained:

_datetime = datetime.datetime.now().strftime("%Y%m%d-%H-%M-%S")
_log_dir = os.path.join(".", "logs", "fit7", model_name, _datetime)

tensorboard_cb = k.callbacks.TensorBoard(log_dir=_log_dir)

model.compile(loss="mae", optimizer=tf.optimizers.Adam(learning_rate=0.001), metrics=[_forecast_mae, _accuracy])

history = model.fit(train_ts_dataset, epochs=100, validation_data=train_ts_dataset, callbacks=[tensorboard_cb])
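To rule out the metric functions themselves, I can also call them eagerly on small made-up tensors (the values below are arbitrary, just a sanity check of _forecast_mae and _accuracy as defined above):

# Eager sanity check of the custom metrics on toy tensors (arbitrary values).
toy_true = tf.ones((2, 34, 1))            # pretend targets, all 1.0
toy_pred = tf.fill((2, 34, 1), 0.9)       # pretend predictions, 10% off

print(_forecast_mae(toy_pred, toy_true).numpy())   # (2, 6) array of ~0.1
print(_accuracy(toy_pred, toy_true).numpy())       # ~0.9, i.e. 1 - MAPE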

Looking at the loss on the training and the validation datasets, I keep seeing the validation loss come out smaller than the training loss. I thought I might be underfitting, so as a simple test I replaced the validation set with the training set and monitored the loss and accuracy during training and evaluation. But I am still getting a validation accuracy greater than the training accuracy. Below is the accuracy across the training and the validation datasets:

[Plot: training vs. validation accuracy per epoch; the validation curve sits above the training curve]

To me it is very weird that I'm getting a validation accuracy greater than the training accuracy even though I'm using the same dataset for training and testing, and there is no Dropout, no BatchNormalization layer, etc.
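One more check I'm considering (a rough sketch, not verified; the SameDataCheck name is just something I made up) is a callback that re-evaluates the model on the very same train_ts_dataset at the end of each epoch, so I can compare that number with what fit() logs for training and for validation:

# Rough diagnostic sketch: re-evaluate on the same dataset at the end of each epoch
# and print the result next to the logs that fit() reports.
class SameDataCheck(k.callbacks.Callback):
    def __init__(self, dataset):
        super().__init__()
        self.dataset = dataset

    def on_epoch_end(self, epoch, logs=None):
        # A clean pass over the dataset with the weights as they are at the end of the epoch.
        results = self.model.evaluate(self.dataset, verbose=0)
        named = dict(zip(self.model.metrics_names, results))
        print(f"epoch {epoch}: fit logs = {logs}, evaluate on same data = {named}")

history = model.fit(train_ts_dataset, epochs=100, validation_data=train_ts_dataset,
                    callbacks=[tensorboard_cb, SameDataCheck(train_ts_dataset)])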

Any hint on what could be the reason for this behavior would be much appreciated!



from Why would using the same dataset for training and testing gives different accuracies?
