I have this DataFrame (df),
on whose features I'd like to train, validate and test a model, using cross-validation:
RangeIndex: 370 entries, 0 to 369
Data columns (total 17 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 round 370 non-null int64 <---- will be dropped
1 home_team_goal 370 non-null int64 <---- will be dropped
2 away_team_goal 370 non-null int64 <---- will be dropped
3 home_best_attack 370 non-null float64
4 home_best_defense 370 non-null float64
5 home_avg_attack 370 non-null float64
6 home_avg_defense 370 non-null float64
7 home_std_attack 370 non-null float64
8 home_std_defense 370 non-null float64
9 gk_home_player_1 370 non-null float64
10 away_avg_attack 370 non-null float64
11 away_avg_defense 370 non-null float64
12 away_std_attack 370 non-null float64
13 away_std_defense 370 non-null float64
14 away_best_attack 370 non-null float64
15 away_best_defense 370 non-null float64
16 gk_away_player_1 370 non-null float64
dtypes: float64(14), int64(3)
Dataset
My dataset, however, must include some added topological features (which capture diagrams and geometrical relationships between the original set of features above).
This is what I need my dataset to end up looking like, as a pandas
DataFrame:
Data columns (total 20 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 home_best_attack 2565 non-null float64
1 home_best_defense 2565 non-null float64
2 home_avg_attack 2565 non-null float64
3 home_avg_defense 2565 non-null float64
4 home_std_attack 2565 non-null float64
5 home_std_defense 2565 non-null float64
6 gk_home_player_1 2565 non-null float64
7 away_avg_attack 2565 non-null float64
8 away_avg_defense 2565 non-null float64
9 away_std_attack 2565 non-null float64
10 away_std_defense 2565 non-null float64
11 away_best_attack 2565 non-null float64
12 away_best_defense 2565 non-null float64
13 gk_away_player_1 2565 non-null float64
14 bottleneck_metric 2565 non-null float64 <---- will be added
15 wasserstein_metric 2565 non-null float64 <---- will be added
16 landscape_metric 2565 non-null float64 <---- will be added
17 betti_metric 2565 non-null float64 <---- will be added
18 heat_metric 2565 non-null float64 <---- will be added
19 label 2565 non-null float64 <---- will be added
Test Feature Extraction
I have the following methods for extracting those added features from the original dataframe:
def extract_topological_features(diagrams):
    """Build one block of amplitude features per metric and join them column-wise.

    For each of the five metric names below an ``Amplitude`` transformer is
    created and ``fit_transform`` is called on ``diagrams``; the five results
    are concatenated along axis 1, in the order the metrics are listed.

    :param diagrams: input handed unchanged to ``Amplitude.fit_transform``
        (presumably a batch of persistence diagrams -- confirm against caller).
    :return: the array produced by ``np.concatenate(..., axis=1)``.
    """
    metric_names = ('bottleneck', 'wasserstein', 'landscape', 'betti', 'heat')
    per_metric = [
        Amplitude(metric=name).fit_transform(diagrams)
        for name in metric_names
    ]
    return np.concatenate(per_metric, axis=1)
def extract_features_for_fantasy_prediction(x_train, y_train, x_test, y_test, pipeline):
    """Append topological features to ``x_test``, computing them batch by batch.

    ``x_test`` is walked in consecutive batches of at most 10 rows. For every
    batch the pipeline is run on the accumulated training rows followed by the
    batch rows, the trailing diagrams (one per batch row) are turned into
    features with ``extract_topological_features``, and the batch is then
    added to the accumulated training rows before the next batch is processed.

    :param x_train: 2-D array of training rows. Only its first
        ``x_test.shape[1]`` columns are used, so an array that already carries
        appended topological columns can be passed as well.
    :param y_train: 1-D array of training labels.
    :param x_test: 2-D array of rows to compute features for.
    :param y_test: labels for ``x_test``; each slice is flattened with
        ``reshape((-1,))`` before being joined to the training labels.
    :param pipeline: object exposing ``fit_transform_resample(X, y)`` that
        returns a ``(diagrams, y)`` pair.
        NOTE(review): the trailing-rows slice assumes the pipeline keeps one
        output per input row, in input order -- confirm for the pipeline used.
    :return: ``x_test`` with the new feature columns concatenated on axis 1.
    :raises ValueError: from ``np.concatenate`` when ``x_test`` has no rows,
        because no feature batch is produced.
    """
    batch_size = 10
    # Use the width of x_test instead of a hard-coded 14 so the two arrays can
    # always be stacked; for 14-column input this is the same slice as before.
    n_base_features = x_test.shape[1]
    all_x_train = x_train[:, :n_base_features]
    all_y_train = y_train

    top_features = []
    for start in tqdm(range(0, len(x_test), batch_size)):
        stop = start + batch_size
        batch_x_test = x_test[start:stop]
        # The last batch may be shorter than batch_size.
        n_rows = len(batch_x_test)

        batch = np.concatenate([all_x_train, batch_x_test])
        batch_y = np.concatenate([all_y_train, y_test[start:stop].reshape((-1,))])
        diagrams_batch, _ = pipeline.fit_transform_resample(batch, batch_y)
        top_features.append(extract_topological_features(diagrams_batch[-n_rows:]))

        # ``batch`` already is "accumulated rows + this batch", so it becomes
        # the accumulated set for the next iteration.
        all_x_train = batch
        all_y_train = batch_y

    return np.concatenate([x_test, np.concatenate(top_features, axis=0)], axis=1)
Cross Validation
and this is my code for doing so:
def cross_validate(self, full_x, full_y, splitting_dates):
    """Tune the model and the topological pipeline, then score them on a held-out window.

    Rows are split on the ``round`` column of ``full_x``:

    * validation: ``train_split_date <= round < val_split_date``
    * test:       ``val_split_date <= round < end_date``
    * training:   every row outside those two windows
      (``round < train_split_date`` or ``round >= end_date``)

    The ``round`` column is removed before anything is fitted.

    :param full_x: DataFrame of features containing a ``round`` column.
    :param full_y: labels aligned with ``full_x``; indexed with the same
        boolean masks and converted with ``.values``.
    :param splitting_dates: sequence whose first three items are
        ``(train_split_date, val_split_date, end_date)``.
    :return: tuple ``(best_model_params, best_topo, best_model_config_with_topo,
        score, score_top, score_best_topo_and_model)``.
    """
    train_split_date, val_split_date, end_date = splitting_dates[:3]

    rounds = full_x['round']
    # The training set must be disjoint from the validation and test windows.
    # The previous mask (round > train_split_date | round <= end_date) matched
    # practically every row and leaked validation/test rows into training.
    train_mask = (rounds < train_split_date) | (rounds >= end_date)
    val_mask = (rounds >= train_split_date) & (rounds < val_split_date)
    test_mask = (rounds >= val_split_date) & (rounds < end_date)

    # ``round`` is only the split key; drop it so it is never fed to a model.
    features = full_x.drop(columns='round')
    train_x, train_y = features[train_mask].values, full_y[train_mask].values
    val_x, val_y = features[val_mask].values, full_y[val_mask].values
    test_x, test_y = features[test_mask].values, full_y[test_mask].values

    # Step 1: model hyper-parameters on the plain features.
    print("START VALIDATING MODEL")
    models_cv = self._validate_k_fold_model(train_x, train_y, val_x, val_y)
    best_model_params = best_combination(models_cv)
    best_model_params.pop("score")
    best_model = RandomForestClassifier(**best_model_params)
    best_model.fit(train_x, train_y)
    score = best_model.score(test_x, test_y)
    print(f'score no_top {score}')
    print(f'best model parameters no_top {best_model_params}')

    # Step 2: parameters of the topological pipeline, using the model above.
    print("START VALIDATING PARAMS")
    topo_cv = self._validate_k_fold_top(best_model, train_x, train_y, val_x, val_y)
    best_topo = best_combination(topo_cv)
    best_topo.pop("score")
    best_topo_pipeline_list = [
        ('extract_subspaces', SubSpaceExtraction(**best_topo)),
        ('compute_diagrams', VietorisRipsPersistence(n_jobs=-1)),
    ]
    best_topo_pipeline = Pipeline(best_topo_pipeline_list)

    # Step 3: refit on train + validation with topological columns appended.
    train_x_for_test = np.concatenate([train_x, val_x], axis=0)
    train_y_for_test = np.concatenate([train_y, val_y], axis=0)
    diagrams_train, _ = best_topo_pipeline.fit_transform_resample(train_x_for_test, train_y_for_test)
    print("EXTRACTING TOPOLOGICAL FEATURES TRAIN")
    top_features_train = extract_topological_features(diagrams_train)
    x_train_model = np.concatenate([train_x_for_test, top_features_train], axis=1)
    best_model.fit(x_train_model, train_y_for_test)

    print("EXTRACTING TOPOLOGICAL FEATURES TEST")
    x_test_model = extract_features_for_fantasy_prediction(x_train_model, train_y_for_test,
                                                           test_x, test_y, best_topo_pipeline)
    score_top = best_model.score(x_test_model, test_y)

    # Step 4: re-tune the model on the topology-augmented features.
    val_x_with_topo = extract_features_for_fantasy_prediction(train_x, train_y, val_x, val_y,
                                                              best_topo_pipeline)
    print('START VALIDATING MODEL WITH OPTIMAL TOPOLOGY')
    # x_train_model stacks train rows first, then validation rows, so its
    # leading train_x.shape[0] rows are the augmented training rows.
    model_config_with_topo = self._validate_k_fold_model(x_train_model[:train_x.shape[0]], train_y,
                                                         val_x_with_topo, val_y)
    best_model_config_with_topo = best_combination(model_config_with_topo)
    best_model_config_with_topo.pop('score')
    best_model_with_topo = RandomForestClassifier(**best_model_config_with_topo)
    best_model_with_topo.fit(x_train_model, train_y_for_test)
    score_best_topo_and_model = best_model_with_topo.score(x_test_model, test_y)
    print(f'score best model and topo_feat {score_best_topo_and_model}')

    return (best_model_params, best_topo, best_model_config_with_topo,
            score, score_top, score_best_topo_and_model)
QUESTION:
Running the following code...
# Labels are computed first, while the goal columns are still in df
# (presumably compute_match_result reads them -- confirm).
y = compute_match_result(df)
df.drop(columns=['home_team_goal', 'away_team_goal'], inplace=True)

cv = CrossValidation(
    k_mins=k_mins,
    k_maxs=k_maxs,
    dist_percentages=distances,
    **model_params,
)
cv_output = cv.cross_validate(
    df,
    y,
    (train_split_date, val_split_date, end_date),
)
...and since most of the features are being computed as arrays,
at precisely which point of cross_validate()
above (and HOW) can I add the topological features to the original df,
ending up with the desired dataset (DataFrame) above?
Note: full code can be found here.
For a dataset sample:
pip install openml
Then at the top of the script:
from openml.datasets import get_dataset
and:
# Fetch OpenML dataset 42188 and keep the first item returned by get_data().
openml_dataset = get_dataset(42188)
x_y_df = openml_dataset.get_data(dataset_format='dataframe')[0]
Hope this helps. Sorry for the heavy code.
from Extract topological features from dataframe and add them to a new, expanded dataframe
No comments:
Post a Comment