From f0eb920f85c0b9a518c8cc9bbc9388b3436f51da Mon Sep 17 00:00:00 2001 From: hetianlin Date: Thu, 10 Jun 2021 16:31:35 +0200 Subject: [PATCH 1/2] original commit from issue 772 --- imblearn/over_sampling/_smote/base.py | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) mode change 100644 => 100755 imblearn/over_sampling/_smote/base.py diff --git a/imblearn/over_sampling/_smote/base.py b/imblearn/over_sampling/_smote/base.py old mode 100644 new mode 100755 index cd7e02e3e..73d7a78d9 --- a/imblearn/over_sampling/_smote/base.py +++ b/imblearn/over_sampling/_smote/base.py @@ -102,7 +102,7 @@ def _make_samples( X_new = self._generate_samples(X, nn_data, nn_num, rows, cols, steps) y_new = np.full(n_samples, fill_value=y_type, dtype=y_dtype) - return X_new, y_new + return X_new, y_new, rows, cols def _generate_samples(self, X, nn_data, nn_num, rows, cols, steps): r"""Generate a synthetic sample. @@ -299,6 +299,9 @@ def _fit_resample(self, X, y): X_resampled = [X.copy()] y_resampled = [y.copy()] + self.real_indices=[i for i in range(len(y))] + self.which_neighbors=[0]*len(y) + for class_sample, n_samples in self.sampling_strategy_.items(): if n_samples == 0: continue @@ -307,19 +310,38 @@ def _fit_resample(self, X, y): self.nn_k_.fit(X_class) nns = self.nn_k_.kneighbors(X_class, return_distance=False)[:, 1:] - X_new, y_new = self._make_samples( + X_new, y_new, rows, cols = self._make_samples( X_class, y.dtype, class_sample, X_class, nns, n_samples, 1.0 ) X_resampled.append(X_new) y_resampled.append(y_new) + self.real_indices.append(target_class_indices[rows]) + self.which_neighbors.append(cols) if sparse.issparse(X): X_resampled = sparse.vstack(X_resampled, format=X.format) else: X_resampled = np.vstack(X_resampled) y_resampled = np.hstack(y_resampled) + self.real_indices=np.hstack(self.real_indices) + self.which_neighbors=np.hstack(self.which_neighbors) return X_resampled, y_resampled + + def sample_indices(self,get_which_neighbors=False): + """return indices + - for real sample, return its own index + - for synthetic sample, return the index of its "mother" real sample + ----------- + if get_which_neighbors=True: + also return which nearest neighbor is used + (for real sample, it is 0) + """ + + if get_which_neighbors is True: + return [(i,j) for i, j in zip(self.real_indices, self.which_neighbors)] + else: + return self.real_indices @Substitution( From bcd8ba9f9b914727dbf984bee957ed1b17a76bb0 Mon Sep 17 00:00:00 2001 From: hetianlin Date: Fri, 18 Jun 2021 14:24:59 +0200 Subject: [PATCH 2/2] PEP8 style check --- imblearn/over_sampling/_smote/base.py | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/imblearn/over_sampling/_smote/base.py b/imblearn/over_sampling/_smote/base.py index 73d7a78d9..4cdc96722 100755 --- a/imblearn/over_sampling/_smote/base.py +++ b/imblearn/over_sampling/_smote/base.py @@ -299,8 +299,8 @@ def _fit_resample(self, X, y): X_resampled = [X.copy()] y_resampled = [y.copy()] - self.real_indices=[i for i in range(len(y))] - self.which_neighbors=[0]*len(y) + self.real_indices = [i for i in range(len(y))] + self.which_neighbors = [0]*len(y) for class_sample, n_samples in self.sampling_strategy_.items(): if n_samples == 0: @@ -323,24 +323,25 @@ def _fit_resample(self, X, y): else: X_resampled = np.vstack(X_resampled) y_resampled = np.hstack(y_resampled) - self.real_indices=np.hstack(self.real_indices) - self.which_neighbors=np.hstack(self.which_neighbors) + self.real_indices = np.hstack(self.real_indices) + self.which_neighbors = np.hstack(self.which_neighbors) return X_resampled, y_resampled - def sample_indices(self,get_which_neighbors=False): + def sample_indices(self, get_which_neighbors=False): """return indices - for real sample, return its own index - - for synthetic sample, return the index of its "mother" real sample + - for synthetic sample, return the index of its "mother" real sample + + Parameters ----------- - if get_which_neighbors=True: - also return which nearest neighbor is used - (for real sample, it is 0) + get_which_neighbors: if ==True returns which nearest neighbor is used + For samples that are not generated, returns 0 """ if get_which_neighbors is True: - return [(i,j) for i, j in zip(self.real_indices, self.which_neighbors)] - else: + return [(i, j) for i, j in zip(self.real_indices, self.which_neighbors)] + else: return self.real_indices @@ -540,7 +541,7 @@ def _fit_resample(self, X, y): X_resampled, y_resampled = super()._fit_resample(X_encoded, y) # reverse the encoding of the categorical features - X_res_cat = X_resampled[:, self.continuous_features_.size :] + X_res_cat = X_resampled[:, self.continuous_features_.size:] X_res_cat.data = np.ones_like(X_res_cat.data) X_res_cat_dec = self.ohe_.inverse_transform(X_res_cat) @@ -595,7 +596,7 @@ def _generate_samples(self, X, nn_data, nn_num, rows, cols, steps): # create non-null entry based on the encoded of OHE if math.isclose(self.median_std_, 0): nn_data[ - :, self.continuous_features_.size : + :, self.continuous_features_.size: ] = self._X_categorical_minority_encoded all_neighbors = nn_data[nn_num[rows]]