from astartes import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
X,
y,
sampler = 'kennard_stone',
)
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
File <timed exec>:4
File ~/mambaforge/lib/python3.11/site-packages/astartes/main.py:114, in train_test_split(X, y, labels, train_size, test_size, sampler, random_state, hopts, return_indices)
87 def train_test_split(
88 X: np.array,
89 y: np.array = None,
(...)
96 return_indices: bool = False,
97 ):
98 """Deterministic train_test_splitting of arbitrary arrays.
99
100 Args:
(...)
112 np.array: X, y, and labels train/val/test data, or indices.
113 """
--> 114 return train_val_test_split(
115 X,
116 y,
117 labels,
118 train_size,
119 0,
120 test_size,
121 sampler,
122 random_state,
123 hopts,
124 return_indices,
125 )
File ~/mambaforge/lib/python3.11/site-packages/astartes/main.py:69, in train_val_test_split(X, y, labels, train_size, val_size, test_size, sampler, random_state, hopts, return_indices)
64 sampler_instance = sampler_factory.get_sampler(X, y, labels, hopts)
66 if sampler in (*IMPLEMENTED_INTERPOLATION_SAMPLERS, "time_based"):
67 # time_based does extrapolation but does not support random_state
68 # because it always sorts in time order
---> 69 return _interpolative_sampling(
70 sampler_instance,
71 test_size,
72 val_size,
73 train_size,
74 return_indices,
75 )
76 else:
77 return _extrapolative_sampling(
78 sampler_instance,
79 test_size,
(...)
83 random_state,
84 )
File ~/mambaforge/lib/python3.11/site-packages/astartes/main.py:228, in _interpolative_sampling(sampler_instance, test_size, val_size, train_size, return_indices)
223 test_idxs = sampler_instance.get_sample_idxs(n_test_samples)
225 _check_actual_split(
226 train_idxs, val_idxs, test_idxs, train_size, val_size, test_size
227 )
--> 228 return _return_helper(
229 sampler_instance, train_idxs, val_idxs, test_idxs, return_indices
230 )
File ~/mambaforge/lib/python3.11/site-packages/astartes/main.py:253, in _return_helper(sampler_instance, train_idxs, val_idxs, test_idxs, return_indices)
240 """Convenience function to return the requested arrays appropriately.
241
242 Args:
(...)
250 np.array: Either many arrays or indices in arrays.
251 """
252 out = []
--> 253 X_train = sampler_instance.X[train_idxs]
254 out.append(X_train)
255 if len(val_idxs):
File ~/mambaforge/lib/python3.11/site-packages/pandas/core/frame.py:3767, in DataFrame.__getitem__(self, key)
3765 if is_iterator(key):
3766 key = list(key)
-> 3767 indexer = self.columns._get_indexer_strict(key, "columns")[1]
3769 # take() does not accept boolean indexers
3770 if getattr(indexer, "dtype", None) == bool:
File ~/mambaforge/lib/python3.11/site-packages/pandas/core/indexes/base.py:5876, in Index._get_indexer_strict(self, key, axis_name)
5873 else:
5874 keyarr, indexer, new_indexer = self._reindex_non_unique(keyarr)
-> 5876 self._raise_if_missing(keyarr, indexer, axis_name)
5878 keyarr = self.take(indexer)
5879 if isinstance(key, Index):
5880 # GH 42790 - Preserve name from an Index
File ~/mambaforge/lib/python3.11/site-packages/pandas/core/indexes/base.py:5935, in Index._raise_if_missing(self, key, indexer, axis_name)
5933 if use_interval_msg:
5934 key = list(key)
-> 5935 raise KeyError(f"None of [{key}] are in the [{axis_name}]")
5937 not_found = list(ensure_index(key)[missing_mask.nonzero()[0]].unique())
5938 raise KeyError(f"{not_found} not in index")
KeyError: "None of [Index([ 0, 106, 768, 1857, 1136, 925, 1276, 121, 1205, 1278,\n ...\n 609, 893, 1738, 1661, 1590, 1630, 302, 1768, 1876, 952],\n dtype='int64', length=1525)] are in the [columns]"
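Expected behavior: the split completes without error when `X` is a pandas DataFrame. Instead, as the traceback shows, `sampler_instance.X[train_idxs]` is handled by `DataFrame.__getitem__`, which looks the integer indices up as column labels rather than selecting rows by position, and raises the `KeyError` above.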
If applicable, add screenshots to help explain your problem.
Add any other context about the problem here that might help developers solve this.