I'm new to ARIMA and am attempting to model a dataset in Python using auto-ARIMA. I'm using auto-ARIMA because I believe it will be better at choosing the values of p, d and q; however, the results are poor and I need some guidance. Please see my reproducible attempt below.
Attempt as follows:
# DEPENDENCIES
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pmdarima as pm
from pmdarima.model_selection import train_test_split
from statsmodels.tsa.stattools import adfuller
from pmdarima.arima import ADFTest
from sklearn.metrics import r2_score
# CREATE DATA
# Monthly observations from 2013-11 through 2023-04 (114 rows).
# The month labels are generated rather than typed out: a hand-written list of
# adjacent string literals without commas is silently concatenated by Python
# into ONE string, which makes the 'date' column a different length from
# 'value' and causes pd.DataFrame to raise.
data_plot = pd.DataFrame({
    # 'MS' = month-start frequency; formatted back to 'YYYY-MM' strings so the
    # column keeps the same textual form as the original data.
    'date': pd.date_range(start='2013-11', end='2023-04', freq='MS').strftime('%Y-%m').tolist(),
    'value': [
        346, 21075, 82358, 91052, 95376, 100520, 107702, 116805, 124176, 136239,
        140815, 159714, 172733, 197447, 297687, 288239, 281170, 277214, 278936, 279071,
        288874, 293893, 299309, 319841, 333347, 371546, 488903, 468856, 460260, 452446,
        448224, 441182, 438710, 437962, 441128, 455476, 462871, 517929, 627044, 601801,
        579134, 576604, 554526, 547522, 559668, 561200, 564239, 583039, 595483, 656733,
        750469, 719269, 720623, 712774, 699002, 692017, 695036, 709596, 720238, 717761,
        719457, 763163, 825152, 786148, 765526, 752169, 740352, 724386, 708216, 709802,
        691991, 698436, 697621, 736228, 779327, 752493, 795272, 780834, 741754, 729164,
        713566, 676471, 646674, 656769, 651333, 664199, 644717, 604296, 591136, 571178,
        556116, 523501, 522527, 520842, 495804, 504137, 483927, 516234, 491449, 461908,
        441156, 437471, 416214, 395315, 390058, 380449, 369834, 373706, 361396, 381941,
        358167, 335394, 325213, 312705,
    ],
})
# SET INDEX
# Parse the 'YYYY-MM' strings into timestamps and use them as the row index so
# the frame can be sliced and plotted as a time series. The explicit format
# avoids any ambiguity in how the year-month strings are interpreted.
data_plot['date_index'] = pd.to_datetime(data_plot['date'], format='%Y-%m')
data_plot.set_index('date_index', inplace=True)
# CREATE ARIMA DATASET
# Keep only the 'value' column, as a one-column DataFrame (double brackets),
# so the date index is carried along with the observations.
arima_data = data_plot[['value']]
arima_data  # bare expression: displays the frame when run in a notebook cell
# PLOT DATA
# Line plot of the raw series against its index.
_value_series = arima_data['value']
_value_series.plot(figsize=(7, 4))
The above steps result in a dataset that should look like this.
# Augmented Dickey-Fuller test for stationarity, run at the 5% significance level
adf_test = ADFTest(alpha=0.05)
# Bare expression: the result is shown when run in a notebook cell.
# NOTE(review): per the pmdarima docs this returns (p-value, should_difference) - confirm.
adf_test.should_diff(x=arima_data)
Result = 0.9867 (p-value), indicating non-stationary data, which should be handled by an appropriate order of differencing later in the auto-ARIMA process.
# Assign training and test subsets - roughly 80:20, holding out the final 24 rows
print('Dataset dimensions;', arima_data.shape)
# .copy() makes each subset an independent frame, so adding columns to one of
# them later does not write into a slice of arima_data.
train_data = arima_data[:-24].copy()
test_data = arima_data[-24:].copy()
print('Training data dimension:', train_data.shape, round((len(train_data)/len(arima_data)*100),2),'% of dataset')
# The test-set share must be computed from test_data (it previously reused
# len(train_data) and therefore reported the training percentage twice).
print('Test data dimension:', test_data.shape, round((len(test_data)/len(arima_data)*100),2),'% of dataset')
# Plot training & test data
plt.plot(train_data)
plt.plot(test_data)
# Run auto arima
# The file imports pmdarima as `pm` and never imports `auto_arima` by name, so
# the function has to be called through the module alias.
arima_model = pm.auto_arima(
    train_data,
    # Non-seasonal orders: p and q are searched from 0 up to 5; d is fixed at 1.
    start_p=0, d=1, start_q=0,
    max_p=5, max_d=5, max_q=5,
    # Seasonal orders: P and Q are searched from 0 up to 5; D is fixed at 1.
    start_P=0, D=1, start_Q=0, max_P=5, max_D=5,
    # m=12: seasonal period of 12 observations (the data is monthly).
    max_Q=5, m=12, seasonal=True,
    stationary=False,
    error_action='warn', trace=True,
    suppress_warnings=True, stepwise=True,
    # NOTE(review): per the pmdarima docs random_state / n_fits only matter for
    # a random (non-stepwise) search - confirm whether they are needed here.
    random_state=20, n_fits=50,
)
print(arima_model.aic())
The output suggests the best model is 'ARIMA(1,1,1)(0,1,0)[12]',
with an AIC of 1725.35484.
# Store predicted values and view resultant df
# Forecast exactly one value per held-out row. A hard-coded horizon (25) that
# differs from the test-set length (24) cannot be lined up with test_data.index.
_n_forecast = len(test_data)
_forecast = arima_model.predict(n_periods=_n_forecast)
# np.asarray drops any index the forecast may carry, so the values are placed
# on the test dates by position instead of being re-aligned (which would give
# NaN wherever the two indexes disagree).
prediction = pd.DataFrame(
    {'predicted_value': np.asarray(_forecast)},
    index=test_data.index,
)
prediction  # bare expression: displays the frame when run in a notebook cell
# Plot prediction against test and training trends
# One figure, three lines drawn in order: training data, held-out test data,
# and the model forecast.
plt.figure(figsize=(7, 4))
for _series, _series_label in (
    (train_data, "Training"),
    (test_data, "Test"),
    (prediction, "Predicted"),
):
    plt.plot(_series, label=_series_label)
plt.legend(loc='upper right')
plt.show()
# Finding r2 model score
# Attach the forecast with .assign(), which returns a new frame, instead of
# writing a column in place: test_data may be a slice of arima_data, and an
# in-place write on a slice triggers pandas' SettingWithCopyWarning.
# Values are aligned on the shared date index.
test_data = test_data.assign(predicted_value=prediction['predicted_value'])
# R^2 of forecast vs. actuals; 1.0 is a perfect fit and the score can be
# negative when the forecast is worse than predicting the test-set mean.
r2_score(test_data['value'], test_data['predicted_value'])
Result: -6.985
from Auto ARIMA in Python results in poor fitting prediction of trend
No comments:
Post a Comment