This is a custom support vectorbased data undersampler answer from my previous question.
The main idea is to undersample the majority class in an informed way, by fitting an SVC to the data, find the support vectors, and then undersample the majority class based on the distances to these support vectors.
Code:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import resample
from sklearn.svm import SVC
import numpy as np
from sklearn.multiclass import OneVsOneClassifier
from imblearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
class DataUndersampler(BaseEstimator, TransformerMixin):
def __init__(self, random_state=None):
self.random_state = random_state
self.svc = SVC(kernel='linear')
def fit(self, X, y):
# Fit SVC to data
self.svc.fit(X, y)
return self
def transform(self, X, y):
# Get support vectors
support_vectors = self.svc.support_vectors_
# Get indices of support vectors
support_vector_indices = self.svc.support_
# Separate majority and minority classes
majority_class = y.value_counts().idxmax()
minority_class = y.value_counts().idxmin()
X_majority = X[y == majority_class]
y_majority = y[y == majority_class]
X_minority = X[y == minority_class]
y_minority = y[y == minority_class]
# Calculate distances of majority class samples to nearest support vector
distances = np.min(np.linalg.norm(X_majority.values[:, np.newaxis] - support_vectors, axis=2), axis=1)
# Sort the majority class samples by distance and take only as many as there are in minority class
sorted_indices = np.argsort(distances)
indices_to_keep = sorted_indices[:len(y_minority)]
# Combine the undersampled majority class with the minority class
X_resampled = pd.concat([X_majority.iloc[indices_to_keep], X_minority])
y_resampled = pd.concat([y_majority.iloc[indices_to_keep], y_minority])
return X_resampled, y_resampled
MWE:
from sklearn.datasets import make_classification
X, y = make_classification(n_samples=10_000, n_classes=5, weights=[22.6, 3.7, 16.4, 51.9],
n_informative=4)
rf_clf = model = RandomForestClassifier()
resampler = DataUndersampler(random_state=234)
pipeline = Pipeline([('sampler', resampler), ('clf', rf_clf)])
classifier = OneVsOneClassifier(estimator=pipeline)
classifier.fit(X, y)
Produces the error:
----> 7 classifier.fit(X, y)
18 frames
/usr/local/lib/python3.10/dist-packages/sklearn/utils/_set_output.py in wrapped(self, X, *args, **kwargs)
138 @wraps(f)
139 def wrapped(self, X, *args, **kwargs):
--> 140 data_to_wrap = f(self, X, *args, **kwargs)
141 if isinstance(data_to_wrap, tuple):
142 # only wrap the first output for cross decomposition
TypeError: DataUndersampler.transform() missing 1 required positional argument: 'y'
from What is responsible for this TypeError: DataUndersampler.transform() missing 1 required positional argument: 'y'?
No comments:
Post a Comment