# clean.py – Clean module of Datawaza
#
# Datawaza Copyright (C) 2024 Jim Beno
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details:
# https://github.com/jbeno/datawaza/blob/main/LICENSE
"""
This module provides tools to clean data in preparation for modeling.
It contains functions to convert data types, convert unites of measurement,
convert time values, reduce multicollinearity, and split out the outliers from
a dataset.
Functions:
- :func:`~datawaza.clean.convert_data_values` - Convert mixed data values (ex: GB, MB, KB) to a common unit of measurement.
- :func:`~datawaza.clean.convert_dtypes` - Convert specified columns in a DataFrame to the desired data type.
- :func:`~datawaza.clean.convert_time_values` - Convert time values in specified columns of a DataFrame to a target format.
- :func:`~datawaza.clean.reduce_multicollinearity` - Reduce multicollinearity in a DataFrame by removing highly correlated features.
- :func:`~datawaza.clean.split_outliers` - Split a DataFrame into two based on the presence of outliers.
"""
# Metadata
__author__ = "Jim Beno"
__email__ = "jim@jimbeno.net"
__version__ = "0.1.3"
__license__ = "GNU GPLv3"
# Standard library imports
import re
# Data manipulation and analysis
import numpy as np
import pandas as pd
from pandas import DataFrame
# Typing imports
from typing import Optional, Union, Tuple, List, Dict, Any
# Functions
[docs]
def convert_data_values(
df: pd.DataFrame,
cols: List[str],
target_unit: str = 'MB',
show_results: bool = False,
inplace: bool = False,
decimal: int = 4,
conversion_dict: Optional[Dict[str, int]] = None
) -> Optional[pd.DataFrame]:
"""
Convert mixed data values (ex: GB, MB, KB) to a common unit of measurement.
This function converts values in the specified columns of the input DataFrame
to the desired target unit. If `inplace` is set to True, the conversion is done
in place, modifying the original DataFrame. If `inplace` is False (default), a
new DataFrame with the converted values is returned. The string suffix is
dropped and the column is converted to a float. It handles inconsistent suffix
strings, with or without spaces after the numbers (ex: '10GB', '10 Gb'). A
variety of spelling options are supported (ex: 'GB', 'Gigabytes'), but you
can pass a custom dictionary as `conversion_dict` if desired. To display a
summary of the changes made, set `show_results` to True.
Use this to clean up messy data that has a variety of units of measurement
appended as text strings to the numeric values. The result will be columns
with a common unit of measurement as floats (with no text suffixes).
Parameters
----------
df : pd.DataFrame
The DataFrame containing the columns to be converted.
cols : List[str]
List of column names in `df` to apply the conversion.
target_unit : str, optional
The target unit for the conversion. Default is 'MB'.
Possible values are: 'B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB'.
show_results : bool, optional
If set to True, prints the before-and-after conversion values for each cell.
Default is False.
inplace : bool, optional
If set to True, the conversion is done in place, modifying the original DataFrame.
If False (default), a new DataFrame with the converted values is returned.
decimal : int, optional
The number of decimal places to show when `show_results` is set to True.
Default is 4.
conversion_dict : Dict[str, int], optional
A custom dictionary mapping units to their corresponding powers of 2.
If provided, it will override the default `unit_conversion` dictionary.
Default is None.
Returns
-------
pd.DataFrame or None
If `inplace` is False (default), returns a new DataFrame with the specified
columns converted to the target unit. If `inplace` is True, returns None as
the original DataFrame is modified in place.
Examples
--------
Prepare some sloppy data for the examples:
>>> df = pd.DataFrame({
... 'A': ['5 gb', '500 kb', '2tb'],
... 'B': ['3GB', '0.5 MB', '200KB'],
... 'C': ['10 Gigabytes', '250 kilo', '1 PB']
... })
>>> cols = ['A', 'B', 'C']
Example 1: Convert values in specified columns to GB and assign to a new df:
>>> df_converted = convert_data_values(df, cols, target_unit='GB')
>>> df_converted
A B C
0 5.000000 3.000000 1.000000e+01
1 0.000477 0.000488 2.384186e-04
2 2048.000000 0.000191 1.048576e+06
Example 2: Convert data values to MB in place, modifying the existing df,
and show a summary of the changes:
>>> convert_data_values(df, cols, target_unit='MB', inplace=True,
... show_results=True, decimal=8)
Original: 5 gb -> Converted: 5120.00000000 MB
Original: 500 kb -> Converted: 0.48828125 MB
Original: 2tb -> Converted: 2097152.00000000 MB
Original: 3GB -> Converted: 3072.00000000 MB
Original: 0.5 MB -> Converted: 0.50000000 MB
Original: 200KB -> Converted: 0.19531250 MB
Original: 10 Gigabytes -> Converted: 10240.00000000 MB
Original: 250 kilo -> Converted: 0.24414062 MB
Original: 1 PB -> Converted: 1073741824.00000000 MB
>>> df
A B C
0 5.120000e+03 3072.000000 1.024000e+04
1 4.882812e-01 0.500000 2.441406e-01
2 2.097152e+06 0.195312 1.073742e+09
"""
# Default conversion factors based on powers of 2
default_unit_conversion = {
'B': 0,
'BYTE': 0,
'BYTES': 0,
'KB': 10,
'KILOBYTE': 10,
'KILOBYTES': 10,
'KILO': 10,
'MB': 20,
'MEGABYTE': 20,
'MEGABYTES': 20,
'MEGA': 20,
'GB': 30,
'GIGABYTE': 30,
'GIGABYTES': 30,
'GIGA': 30,
'TB': 40,
'TERABYTE': 40,
'TERABYTES': 40,
'TERA': 40,
'PB': 50,
'PETABYTE': 50,
'PETABYTES': 50,
'PETA': 50,
'EB': 60,
'EXABYTE': 60,
'EXABYTES': 60,
'EXA': 60
}
# Use the provided conversion_dict if passed as a parameter
unit_conversion = conversion_dict if conversion_dict is not None else default_unit_conversion
# Convert target_unit to uppercase for case-insensitive comparison
target_unit = target_unit.upper()
# Copy dataframe if not modifying in place
if not inplace:
df = df.copy()
# Iterate through the columns
for col in cols:
# Function to convert a data value
def convert_value(value):
if pd.isna(value): # Handle NaNs
if show_results:
print(f"Original: NaN -> Converted: NaN")
return np.nan
# RegEx handles both with spaces or without, ex: '10 GB' and '10GB'
match = re.match(r'(\d+\.?\d*)\s?([A-Za-z]+)', value, re.IGNORECASE)
if not match:
raise ValueError(f"Invalid format for value: {value}")
# Assign the 2 groups from the RegEx matches
number, unit = match.groups()
unit = unit.upper() # Convert the unit to uppercase
number = float(number) # Convert the values to floats
# Convert the number to bytes
bytes_value = number * (2 ** unit_conversion[unit])
# Convert the bytes value to the target unit
converted_value = bytes_value / (2 ** unit_conversion[target_unit])
# Print the before/after values if show_results is True
if show_results:
print(f"Original: {value} -> Converted: {converted_value:.{decimal}f} {target_unit}")
return converted_value
# Apply the conversion function to values in each column, convert to float
df[col] = df[col].apply(convert_value).astype(float)
# Return a dataframe only if not modifying in place
if inplace:
return None
else:
return df
[docs]
def convert_dtypes(
df: pd.DataFrame,
cols: List[str],
target_dtype: Union[type, str],
show_results: bool = False,
inplace: bool = True
) -> Optional[pd.DataFrame]:
"""
Convert specified columns in a DataFrame to the desired data type.
This function converts the data type of the specified columns in the input
DataFrame to the desired target data type. It supports both base Python data
types (e.g., int, float, str) and Pandas-specific data types (e.g., 'int64',
'float64', 'object', 'bool', 'datetime64', 'timedelta[ns]', 'category'). If
`inplace` is set to True (default), the conversion is done in place, modifying
the original DataFrame. If `inplace` is False, a new DataFrame with the
converted columns is returned. If `show_results` is set to True, it will print
the results of each successful conversion and any error messages for columns
that could not be converted.
Use this function when you need to convert the data types of specific columns
in a DataFrame to a consistent target data type, especially when dealing with
multiple columns at once and identifying columns that require further data
cleaning.
Parameters
----------
df : pd.DataFrame
The DataFrame containing the columns to be converted.
cols : List[str]
List of column names in `df` to be converted.
target_dtype : type or str
The desired data type for the conversion. Can be a base Python data type
(e.g., int, float, str) or a string representation of a Pandas data type
(e.g., 'int64', 'float64', 'object', 'bool', 'datetime64', 'timedelta[ns]',
'category').
show_results : bool, optional
If True, will print the results of each successful conversion and any error
messages for columns that could not be converted. Default is False.
inplace : bool, optional
If True (default), the conversion is done in place, modifying the original
DataFrame. If False, a new DataFrame with the converted columns is
returned.
Returns
-------
pd.DataFrame or None
If `inplace` is False, returns a new DataFrame with the specified columns
converted to the target data type. If `inplace` is True, returns None as
the original DataFrame is modified in place.
Examples
--------
Prepare data for examples:
>>> df = pd.DataFrame({
... 'A': [1, 2, 3],
... 'B': ['0', '23.4 MB', '3.71 GB'],
... 'C': ['4.5', '3.7', '12.15'],
... 'D': [True, False, True],
... 'E': ['Yes', 'No', ''],
... 'F': ['Low', 'Medium', 'High']
... })
>>> num_columns = ['A', 'B', 'C']
>>> cat_columns = ['D', 'E', 'F']
>>> df.dtypes
A int64
B object
C object
D bool
E object
F object
dtype: object
Example 1: Convert columns 'A', 'B', and 'C' to float and show the results:
>>> convert_dtypes(df, num_columns, 'float')
Error converting column: B (Current dtype: object). Error message: could not convert string to float: '23.4 MB'
>>> df.dtypes
A float64
B object
C float64
D bool
E object
F object
dtype: object
Example 2: Convert columns 'D', 'E', and 'F' to category, show the results, and
return a new DataFrame:
>>> new_df = convert_dtypes(df, cat_columns, 'category', inplace=False,
... show_results=True)
Successfully converted column 'D' from bool to category.
Successfully converted column 'E' from object to category.
Successfully converted column 'F' from object to category.
>>> new_df.dtypes
A float64
B object
C float64
D category
E category
F category
dtype: object
"""
if not inplace:
df = df.copy()
for col in cols:
try:
current_dtype = df[col].dtype
if isinstance(target_dtype, str):
df[col] = df[col].astype(target_dtype)
else:
df[col] = df[col].astype(target_dtype)
if show_results:
print(f"Successfully converted column '{col}' from {current_dtype} to {df[col].dtype}.")
except (ValueError, TypeError) as e:
print(f"Error converting column: {col} (Current dtype: {current_dtype}). Error message: {e}")
if not inplace:
return df
[docs]
def convert_time_values(
df: pd.DataFrame,
cols: List[str],
target_format: str = '%Y-%m-%d %H:%M:%S',
target_dtype: Union[str, type, np.dtype] = 'datetime64[ns]',
show_results: bool = False,
inplace: bool = False,
zero_to_nan: bool = False,
pattern_list: Optional[List[str]] = None
) -> Optional[pd.DataFrame]:
"""
Convert time values in columns of a DataFrame to a target format and data type.
This function converts time values in the specified columns of the input
DataFrame to the desired target format and data type. If `inplace` is set to
True, the conversion is done in place, modifying the original DataFrame. If
`inplace` is False (default), a new DataFrame with the converted values is
returned.
The function can handle time values in various formats, including:
1. Excel serial format (e.g., '45161.23458')
2. String format (e.g., 'YYYY-MM-DD')
3. UNIX epoch in milliseconds (e.g., '1640304000000.0')
If your format is not supported, you can define `pattern_list` as a list of
custom datetime patterns.
If `zero_to_nan` is set to True, values of '0', '0.0', '0.00', 0, 0.0, or 0.00
will be replaced with NaN. Otherwise, zero values will be detected as a
Unix Epoch format with value 1970-01-01 00:00:00.
You can use the default `target_format` of '%Y-%m-%d %H:%M:%S', or specify
a different format. The `target_dtype` parameter allows you to specify the
desired data type for the converted columns. When working with formatted date
strings, the applicable data types are 'str', 'object', or 'datetime64[ns]'.
- If `target_dtype` is set to 'str' or 'object', the converted data will be
stored as strings in the specified `target_format`. Missing values will be
represented as 'NaN'.
- If `target_dtype` is set to 'datetime64[ns]', the converted data will be
stored as pandas datetime objects. The `target_format` has no effect in
this scenario. Missing values will be represented as 'NaT'.
To display a summary of the changes made, set `show_results` to True.
Parameters
----------
df : pd.DataFrame
The DataFrame containing the columns to be converted.
cols : List[str]
List of column names in `df` to apply the conversion.
target_format : str, optional
The desired datetime format for the conversion. Uses format codes such as:
%Y: 4-digit year, %m: Month as zero-padded decimal, %d: Day of the month,
%H: Hour (24-hour clock), %M: Minute, %S: Second.
Default format is '%Y-%m-%d %H:%M:%S'.
target_dtype : Union[str, type, np.dtype], optional
The desired data type for the converted columns. Applicable data types are
'str', 'object', or 'datetime64[ns]'. Default is 'datetime64[ns]'.
show_results : bool, optional
If set to True, prints the before-and-after conversion values for each
cell. Default is False.
inplace : bool, optional
If set to True, the conversion is done in place, modifying the original
DataFrame. If False (default), a new DataFrame with the converted values
is returned.
zero_to_nan : bool, optional
If set to True, values of '0', '0.0', '0.00', 0, 0.0, or 0.00 will be
replaced with NaN or NaT, depending on `target_dtype`. Default is False.
pattern_list : List[str], optional
A list of custom datetime patterns to override the default patterns.
If provided, it will be used instead of the default patterns.
Default is None.
Returns
-------
pd.DataFrame or None
If `inplace` is False (default), returns a new DataFrame with the specified
columns converted to the target format and data type. If `inplace` is True,
returns None as the original DataFrame is modified in place.
Examples
--------
Prepare some sloppy data for the examples:
>>> df = pd.DataFrame({
... 'A': ['45161.23458', '2019-02-09', '1640304000000.0'],
... 'B': ['2022-01-01', '45000.5', '1577836800000.0'],
... 'C': ['0', '45161.23458', '2019-02-09']
... })
>>> cols = ['A', 'B', 'C']
Example 1: Convert time values in specified columns to the default format
and data type:
>>> df_converted = convert_time_values(df, cols)
>>> df_converted
A B C
0 2023-08-25 05:37:47 2022-01-01 00:00:00 1970-01-01 00:00:00
1 2019-02-09 00:00:00 2023-03-17 12:00:00 2023-08-25 05:37:47
2 2021-12-24 00:00:00 2020-01-01 00:00:00 2019-02-09 00:00:00
>>> df_converted.dtypes
A datetime64[ns]
B datetime64[ns]
C datetime64[ns]
dtype: object
Example 2: Convert time values in specified columns to the default format
and data type, showing a summary of changes:
>>> df_converted = convert_time_values(df, cols, show_results=True)
Original: 45161.23458 (Excel Serial) -> Converted: 2023-08-25 05:37:47
Original: 2019-02-09 (Standard Datetime String) -> Converted: 2019-02-09 00:00:00
Original: 1640304000000.0 (UNIX Epoch in milliseconds) -> Converted: 2021-12-24 00:00:00
Original: 2022-01-01 (Standard Datetime String) -> Converted: 2022-01-01 00:00:00
Original: 45000.5 (Excel Serial) -> Converted: 2023-03-17 12:00:00
Original: 1577836800000.0 (UNIX Epoch in milliseconds) -> Converted: 2020-01-01 00:00:00
Original: 0 (UNIX Epoch) -> Converted: 1970-01-01 00:00:00
Original: 45161.23458 (Excel Serial) -> Converted: 2023-08-25 05:37:47
Original: 2019-02-09 (Standard Datetime String) -> Converted: 2019-02-09 00:00:00
>>> df_converted
A B C
0 2023-08-25 05:37:47 2022-01-01 00:00:00 1970-01-01 00:00:00
1 2019-02-09 00:00:00 2023-03-17 12:00:00 2023-08-25 05:37:47
2 2021-12-24 00:00:00 2020-01-01 00:00:00 2019-02-09 00:00:00
Example 3: Convert time values to a custom format and data type in place,
replacing zeros with NaN, and showing a summary of changes:
>>> convert_time_values(df, cols, target_format='%d/%m/%Y', target_dtype='str',
... inplace=True, show_results=True, zero_to_nan=True)
Original: 45161.23458 (Excel Serial) -> Converted: 25/08/2023
Original: 2019-02-09 (Standard Datetime String) -> Converted: 09/02/2019
Original: 1640304000000.0 (UNIX Epoch in milliseconds) -> Converted: 24/12/2021
Original: 2022-01-01 (Standard Datetime String) -> Converted: 01/01/2022
Original: 45000.5 (Excel Serial) -> Converted: 17/03/2023
Original: 1577836800000.0 (UNIX Epoch in milliseconds) -> Converted: 01/01/2020
Original: 0 (Zero) -> Converted: NaN
Original: 45161.23458 (Excel Serial) -> Converted: 25/08/2023
Original: 2019-02-09 (Standard Datetime String) -> Converted: 09/02/2019
>>> df
A B C
0 25/08/2023 01/01/2022 NaN
1 09/02/2019 17/03/2023 25/08/2023
2 24/12/2021 01/01/2020 09/02/2019
>>> df.dtypes
A object
B object
C object
dtype: object
"""
# Default datetime patterns
default_patterns = [
r"^\d{4}-\d{2}-\d{2}$", # YYYY-MM-DD
r"^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}$", # YYYY-MM-DD HH:MM:SS
r"^\d{2}/\d{2}/\d{4}$", # MM/DD/YYYY
]
# Use the provided pattern_list if passed as a parameter
patterns = pattern_list if pattern_list is not None else default_patterns
# Set zero patterns to look for
zero_patterns = ['0', '0.0', '0.00', 0, 0.0, 0.00]
# Copy dataframe if not modifying in place
if not inplace:
df = df.copy()
# Iterate through the columns
for col in cols:
# Function to convert a time value
def convert_value(value):
# If value is NaN or NaT, return the appropriate missing value based on target_dtype
if pd.isna(value) or value is pd.NaT:
if target_dtype == 'datetime64[ns]':
converted_value = pd.NaT
else:
converted_value = 'NaN'
if show_results:
print(f"Original: {value} -> Converted: {converted_value}")
return converted_value
# If zero_to_nan is True and value matches zero patterns, return the appropriate missing value based on target_dtype
if zero_to_nan and str(value) in zero_patterns:
if target_dtype == 'datetime64[ns]':
converted_value = pd.NaT
else:
converted_value = 'NaN'
if show_results:
print(f"Original: {value} (Zero) -> Converted: {converted_value}")
return converted_value
detected_format = None
try:
# Convert Excel Serial to datetime
if isinstance(value, (float, int)) or (isinstance(value, str) and "." in value):
float_value = float(value)
if 40000 < float_value < 50000: # Typical range for recent Excel serials
datetime_val = pd.Timestamp('1900-01-01') + pd.to_timedelta(float_value, 'D')
detected_format = "Excel Serial"
else:
datetime_val = pd.to_datetime(float(value), unit='ms')
detected_format = "UNIX Epoch in milliseconds"
# If value is '0', convert it to Unix epoch
elif str(value) in zero_patterns:
datetime_val = pd.to_datetime(0, unit='s')
detected_format = "UNIX Epoch"
# Assume it's already in a recognizable format (like 'YYYY-MM-DD')
elif isinstance(value, str):
if any(re.match(pattern, value) for pattern in patterns):
datetime_val = pd.to_datetime(value)
detected_format = "Standard Datetime String"
else:
raise ValueError(f"Unrecognized format for value: {value}")
# Format conversion using the specified target_format
formatted_datetime = datetime_val.strftime(target_format)
except Exception as e:
raise ValueError(f"Error converting value: {value}. Additional info: {e}")
if show_results:
print(f"Original: {value} ({detected_format}) -> Converted: {formatted_datetime}")
return formatted_datetime
# Apply the conversion function to values in each column
df[col] = df[col].apply(convert_value)
# Convert the modified columns to the specified data type
for col in cols:
df[col] = df[col].astype(target_dtype)
# Return a dataframe only if not modifying in place
if inplace:
return None
else:
return df
[docs]
def reduce_multicollinearity(
df: DataFrame,
target_col: str,
corr_threshold: float = 0.9,
consider_nan: bool = False,
consider_zero: bool = False,
diff_threshold: float = 0.1,
decimal: int = 2
) -> DataFrame:
"""
Reduce multicollinearity in a DataFrame by removing highly correlated features.
This function iteratively evaluates pairs of features in a DataFrame based on
their correlation to each other and to a specified target column. If two
features are highly correlated (above `corr_threshold`), the one with the lower
correlation to the target column is removed. The number of NaN and/or zero
values can also be considered (prefering removal of features with more) by
setting `consider_nan` or `consider_zero` to True. The threshold for
significant differences (`diff_threshold`) can also be adjusted. Sometimes it
might appear as if the correlations are the same, but it says one is greater.
Adjust `decimal` to a larger number to see more precision in the correlation.
Use this function to remove redundant features, and reduce a large feature set
to a smaller one that contains the features most correlated with the target.
This should improve the model's ability to learn from the dataset, improve
performance, and increase interpretability of results.
Parameters
----------
df : DataFrame
The DataFrame to evaluate. It should have features and the target column.
target_col : str
The name of the target column against which feature correlations are
evaluated.
corr_threshold : float, optional
The correlation threshold above which a pair of features is considered
highly correlated. Default is 0.9.
consider_nan : bool, optional
If True, considers the number of NaN values in the decision process.
Default is False.
consider_zero : bool, optional
If True, considers the number of zero values in the decision process.
Default is False.
diff_threshold : float, optional
The threshold for considering the difference in NaN/zero counts as
significant. Default is 0.1.
decimal : int, optional
The number of decimals to round to when displaying output. Default is 2.
Returns
-------
DataFrame
A modified DataFrame with reduced multicollinearity, containing only the
features that were kept after evaluation.
Examples
--------
Prepare the data for the examples:
>>> np.random.seed(0) # For reproducibility
>>> A = np.random.rand(100)
>>> B = A + np.random.normal(0, 0.01, 100) # B is highly correlated with A
>>> C = np.random.rand(100)
>>> D = B + np.random.rand(100) # D is highly correlated with B
>>> Target = A * 0.1 + C * 0.5 + D * 0.8 + np.random.normal(0, 0.1, 100)
>>> df = pd.DataFrame({'A': A, 'B': B, 'C': C, 'D': D, 'Target': Target})
Example 1: Reduce multicollinearity in a DataFrame with correlated features
and a target column:
>>> reduced_df = reduce_multicollinearity(df, 'Target', decimal=4)
Evaluating pair: 'B' and 'A' (1.0) - 5 kept features
- Correlation with target: 0.6509, 0.6468
- Keeping 'B' (higher correlation, lower or equal count)
<BLANKLINE>
Example 2: Reduce multicollinearity with a lower correlation threshold and
considering NaN and Zero values:
>>> reduced_df = reduce_multicollinearity(df, 'Target', corr_threshold=0.6,
... consider_zero=True, consider_nan=True, diff_threshold=0.2)
Evaluating pair: 'D' and 'A' (0.66) - 5 kept features
- Correlation with target: 0.86, 0.65
- NaN/0 counts: 0, 0
- Keeping 'D' (higher correlation, lower or equal count)
<BLANKLINE>
Evaluating pair: 'D' and 'B' (0.66) - 4 kept features
- Correlation with target: 0.86, 0.65
- NaN/0 counts: 0, 0
- Keeping 'D' (higher correlation, lower or equal count)
<BLANKLINE>
"""
# Initial set up
original_features = set(df.columns)
kept_features = original_features.copy()
# Initialize consider_text
if consider_nan and consider_zero:
consider_text = 'NaN/0'
elif consider_nan:
consider_text = 'NaN'
elif consider_zero:
consider_text = '0'
else:
consider_text = 'None'
while True:
kept_features_list = list(kept_features)
corr_matrix = df[kept_features_list].corr()
target_correlations = corr_matrix[target_col].abs().sort_values(ascending=False)
changes_made = False
evaluated_pairs = set()
for feature in target_correlations.index.drop(target_col):
if feature not in kept_features:
continue # Skip if feature already removed
feature_removed = False # Initialize flag to track if 'feature' is removed
high_corr_features = corr_matrix[feature][corr_matrix[feature].abs() >= corr_threshold].index.drop(feature)
high_corr_features = high_corr_features.difference(evaluated_pairs)
high_corr_features = sorted(high_corr_features)
for other_feature in high_corr_features:
if other_feature not in kept_features or other_feature == target_col:
continue # Skip if other feature already removed or is target
evaluated_pairs.add((feature, other_feature)) # Mark this pair as evaluated
evaluated_pairs.add((other_feature, feature)) # Experimental: does this help?
feature_pair_corr = corr_matrix.at[feature, other_feature]
corr_with_target_feature = target_correlations[feature]
corr_with_target_other = target_correlations[other_feature]
nan_count_feature = df[feature].isna().sum() if consider_nan else 0
zero_count_feature = (df[feature] == 0).sum() if consider_zero else 0
nan_count_other = df[other_feature].isna().sum() if consider_nan else 0
zero_count_other = (df[other_feature] == 0).sum() if consider_zero else 0
total_count_feature = nan_count_feature + zero_count_feature
total_count_other = nan_count_other + zero_count_other
# Evaluate if significant difference in counts
max_count = max(total_count_feature, total_count_other)
if max_count == 0: # Avoid division by zero
sig_diff = False
else:
relative_difference = abs(total_count_feature - total_count_other) / max_count
sig_diff = relative_difference > diff_threshold
# Print decision process
print(f"Evaluating pair: '{feature}' and '{other_feature}' ({round(feature_pair_corr, 2)}) - {len(kept_features)} kept features")
print(f" - Correlation with target: {corr_with_target_feature:.{decimal}f}, {corr_with_target_other:.{decimal}f}")
if consider_nan or consider_zero:
print(f" - {consider_text} counts: {total_count_feature}, {total_count_other}")
# Logic to decide which feature to keep
if corr_with_target_feature > corr_with_target_other and total_count_feature <= total_count_other:
print(f" - Keeping '{feature}' (higher correlation, lower or equal count)\n")
if other_feature in kept_features:
kept_features.remove(other_feature)
changes_made = True
else:
print(f"{other_feature} already removed\n")
elif corr_with_target_feature > corr_with_target_other and not sig_diff:
print(f" - Keeping '{feature}' (higher correlation, no significant diff: {relative_difference:.{decimal}f} <= {diff_threshold:.{decimal}f})\n")
if other_feature in kept_features:
kept_features.remove(other_feature)
changes_made = True
else:
print(f"{other_feature} already removed\n")
elif corr_with_target_feature > corr_with_target_other and sig_diff:
print(f" - Keeping '{other_feature}' (higher correlation, significant diff: {relative_difference:.{decimal}f} > {diff_threshold:.{decimal}f})\n")
if feature in kept_features:
kept_features.remove(feature)
changes_made = True
feature_removed = True
else:
print(f"{feature} already removed\n")
elif corr_with_target_feature == corr_with_target_other and total_count_feature <= total_count_other:
print(f" - Keeping '{feature}' (equal correlation, lower or equal count)\n")
if other_feature in kept_features:
kept_features.remove(other_feature)
changes_made = True
else:
print(f"{other_feature} already removed\n")
elif corr_with_target_feature == corr_with_target_other and total_count_feature > total_count_other:
print(f" - Keeping '{other_feature}' (equal correlation, higher count)\n")
if feature in kept_features:
kept_features.remove(feature)
changes_made = True
feature_removed = True
else:
print(f"{feature} already removed\n")
elif corr_with_target_feature < corr_with_target_other and total_count_feature == total_count_other:
print(f" - Keeping '{other_feature}' (lower correlation, equal count)\n")
if feature in kept_features:
kept_features.remove(feature)
changes_made = True
feature_removed = True
else:
print(f"{feature} already removed\n")
elif corr_with_target_feature < corr_with_target_other and not sig_diff:
print(f" - Keeping '{other_feature}' (lower correlation, no significant diff: {relative_difference:.{decimal}f} <= {diff_threshold:.{decimal}f})\n")
if feature in kept_features:
kept_features.remove(feature)
changes_made = True
feature_removed = True
else:
print(f"{feature} already removed\n")
else:
print(f" - Keeping '{feature}' (otherwise, go with first feature)\n")
if other_feature in kept_features:
kept_features.remove(other_feature)
changes_made = True
else:
print(f"{other_feature} already removed\n")
if feature_removed:
break # Break out of the first for loop if the feature was removed
if not changes_made:
# No more features to evaluate, end the loop
break
# Prepare final dataframe with kept features
to_drop = original_features - kept_features
reduced_df = df.drop(columns=to_drop)
return reduced_df
[docs]
def split_outliers(
df: pd.DataFrame,
columns: Optional[List[str]] = None,
iqr_multiplier: float = 1.5
) -> Tuple[pd.DataFrame, pd.DataFrame]:
"""
Split a DataFrame into two based on the presence of outliers.
This function identifies outliers in the specified columns of the
input DataFrame using the Interquartile Range (IQR) method. It then
splits the DataFrame into two: one containing rows without outliers
and another containing only the rows with outliers.
Use this function when you need to separate outliers from the main
data for further analysis or processing.
Parameters
----------
df : pd.DataFrame
The input DataFrame containing the data to be split.
columns : List[str], optional
List of column names to consider for outlier detection. If None,
all columns in the DataFrame will be considered. Default is None.
iqr_multiplier : float, optional
The multiplier for the IQR range to determine outliers. Default
is 1.5.
Returns
-------
Tuple[pd.DataFrame, pd.DataFrame]
A tuple containing two DataFrames: The first DataFrame (df_no_outliers)
contains rows without outliers. The second DataFrame (df_outliers)
contains only the rows with outliers.
Examples
--------
Prepare the data for the examples:
>>> df = pd.DataFrame({
... 'A': [1, 2, 3, 4, 5, 100],
... 'B': [10, 20, 30, 40, 50, 600],
... 'C': [-30, 4, 3, 2, 1, 3]
... })
Example 1: Split outliers considering all columns:
>>> df_no_outliers, df_outliers = split_outliers(df)
>>> df_no_outliers
A B C
1 2 20 4
2 3 30 3
3 4 40 2
4 5 50 1
>>> df_outliers
A B C
0 1 10 -30
5 100 600 3
Example 2: Split outliers considering specific columns:
>>> df_no_outliers, df_outliers = split_outliers(df, columns=['A', 'B'])
>>> df_no_outliers
A B C
0 1 10 -30
1 2 20 4
2 3 30 3
3 4 40 2
4 5 50 1
>>> df_outliers
A B C
5 100 600 3
"""
# If columns parameter is not provided, use all columns in the DataFrame
if columns is None:
columns = df.columns
# Create an initial mask with all False values (meaning no outliers)
outlier_mask = pd.Series(False, index=df.index)
# For each specified column, update the outlier mask to mark outliers
for col in columns:
Q1 = df[col].quantile(0.25)
Q3 = df[col].quantile(0.75)
IQR = Q3 - Q1
# Update mask for outliers in the current column
outlier_mask |= (df[col] < (Q1 - iqr_multiplier * IQR)) | \
(df[col] > (Q3 + iqr_multiplier * IQR))
# Use the mask to split the data
df_no_outliers = df[~outlier_mask]
df_outliers = df[outlier_mask]
return df_no_outliers, df_outliers