Skip to content

Commit

Permalink
remove redundant lines
Browse files Browse the repository at this point in the history
  • Loading branch information
ravinkohli committed Oct 26, 2021
1 parent 4da0f38 commit 4b72887
Showing 1 changed file with 0 additions and 56 deletions.
56 changes: 0 additions & 56 deletions autoPyTorch/data/tabular_feature_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -483,59 +483,3 @@ def infer_objects(self, X: pd.DataFrame) -> pd.DataFrame:
self.object_dtype_mapping = {column: X[column].dtype for column in X.columns}
self.logger.debug(f"Infer Objects: {self.object_dtype_mapping}")
return X

def impute_nan_in_categories(self, X: pd.DataFrame) -> pd.DataFrame:
    """
    Impute missing values in the columns to encode before encoding.

    Remove once sklearn natively supports missing values in ordinal
    encoding. Sklearn issue:
    https://github.com/scikit-learn/scikit-learn/issues/17123

    The placeholder chosen for a column is stored in
    ``self.dict_missing_value_per_col`` and reused on later calls, so the
    same column is always filled with the same value.

    Arguments:
        X (pd.DataFrame):
            data to be interpreted. Columns listed in ``self.enc_columns``
            that contain missing values are modified in place.
    Returns:
        pd.DataFrame:
            the same frame, with the missing values of the columns to
            encode replaced by a per-column placeholder.
    """

    # To be on the safe side, map always to the same missing value per
    # column: the cache must therefore survive across calls.
    if not hasattr(self, 'dict_missing_value_per_col'):
        self.dict_missing_value_per_col: typing.Dict[str, typing.Any] = {}

    for column in self.enc_columns:
        if not X[column].isna().any():
            continue

        if column not in self.dict_missing_value_per_col:
            # Make sure that we do not alter the type of the column, which
            # would cause in the encoding:
            # TypeError: '<' not supported between instances of 'int' and 'str'
            observed = X[column].dropna().values
            try:
                float(observed[0])
                can_cast_as_number = True
            except (TypeError, ValueError, IndexError, OverflowError):
                # Not number-like, or the column holds only missing values
                can_cast_as_number = False

            if can_cast_as_number:
                # In this case, we expect to have a number as category.
                # It might be a string, but its value represents a number.
                missing_value: typing.Union[str, int] = '-1' if isinstance(observed[0], str) else -1
            else:
                missing_value = 'Missing!'

            # Make sure this missing value is not seen before.
            # Do this check for categorical columns, else modify the value.
            if hasattr(X[column], 'cat'):
                while missing_value in X[column].cat.categories:
                    if isinstance(missing_value, str):
                        missing_value += '0'
                    else:
                        missing_value += missing_value
            self.dict_missing_value_per_col[column] = missing_value

        fill_value = self.dict_missing_value_per_col[column]
        filled = X[column]
        # A categorical column can only be filled with a known category.
        # Register the placeholder first, unless it is already a category
        # (add_categories raises on duplicates). Non categorical columns
        # have no `.cat` accessor and can be filled directly.
        if hasattr(filled, 'cat') and fill_value not in filled.cat.categories:
            filled = filled.cat.add_categories([fill_value])

        # Write the column back explicitly: the `inplace` flag of
        # add_categories is not available in recent pandas releases.
        X[column] = filled.fillna(fill_value)
    return X

0 comments on commit 4b72887

Please sign in to comment.