Source code for datawaza.clean

# clean.py – Clean module of Datawaza
#
# Datawaza  Copyright (C) 2024  Jim Beno
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details:
# https://github.com/jbeno/datawaza/blob/main/LICENSE
"""
This module provides tools to clean data in preparation for modeling.
It contains functions to convert data types, convert unites of measurement,
convert time values, reduce multicollinearity, and split out the outliers from
a dataset.

Functions:
    - :func:`~datawaza.clean.convert_data_values` - Convert mixed data values (ex: GB, MB, KB) to a common unit of measurement.
    - :func:`~datawaza.clean.convert_dtypes` - Convert specified columns in a DataFrame to the desired data type.
    - :func:`~datawaza.clean.convert_time_values` - Convert time values in specified columns of a DataFrame to a target format.
    - :func:`~datawaza.clean.reduce_multicollinearity` - Reduce multicollinearity in a DataFrame by removing highly correlated features.
    - :func:`~datawaza.clean.split_outliers` - Split a DataFrame into two based on the presence of outliers.
"""

# Metadata
__author__ = "Jim Beno"
__email__ = "jim@jimbeno.net"
__version__ = "0.1.3"
__license__ = "GNU GPLv3"

# Standard library imports
import re

# Data manipulation and analysis
import numpy as np
import pandas as pd
from pandas import DataFrame

# Typing imports
from typing import Optional, Union, Tuple, List, Dict, Any


# Functions
[docs] def convert_data_values( df: pd.DataFrame, cols: List[str], target_unit: str = 'MB', show_results: bool = False, inplace: bool = False, decimal: int = 4, conversion_dict: Optional[Dict[str, int]] = None ) -> Optional[pd.DataFrame]: """ Convert mixed data values (ex: GB, MB, KB) to a common unit of measurement. This function converts values in the specified columns of the input DataFrame to the desired target unit. If `inplace` is set to True, the conversion is done in place, modifying the original DataFrame. If `inplace` is False (default), a new DataFrame with the converted values is returned. The string suffix is dropped and the column is converted to a float. It handles inconsistent suffix strings, with or without spaces after the numbers (ex: '10GB', '10 Gb'). A variety of spelling options are supported (ex: 'GB', 'Gigabytes'), but you can pass a custom dictionary as `conversion_dict` if desired. To display a summary of the changes made, set `show_results` to True. Use this to clean up messy data that has a variety of units of measurement appended as text strings to the numeric values. The result will be columns with a common unit of measurement as floats (with no text suffixes). Parameters ---------- df : pd.DataFrame The DataFrame containing the columns to be converted. cols : List[str] List of column names in `df` to apply the conversion. target_unit : str, optional The target unit for the conversion. Default is 'MB'. Possible values are: 'B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB'. show_results : bool, optional If set to True, prints the before-and-after conversion values for each cell. Default is False. inplace : bool, optional If set to True, the conversion is done in place, modifying the original DataFrame. If False (default), a new DataFrame with the converted values is returned. decimal : int, optional The number of decimal places to show when `show_results` is set to True. Default is 4. conversion_dict : Dict[str, int], optional A custom dictionary mapping units to their corresponding powers of 2. If provided, it will override the default `unit_conversion` dictionary. Default is None. Returns ------- pd.DataFrame or None If `inplace` is False (default), returns a new DataFrame with the specified columns converted to the target unit. If `inplace` is True, returns None as the original DataFrame is modified in place. Examples -------- Prepare some sloppy data for the examples: >>> df = pd.DataFrame({ ... 'A': ['5 gb', '500 kb', '2tb'], ... 'B': ['3GB', '0.5 MB', '200KB'], ... 'C': ['10 Gigabytes', '250 kilo', '1 PB'] ... }) >>> cols = ['A', 'B', 'C'] Example 1: Convert values in specified columns to GB and assign to a new df: >>> df_converted = convert_data_values(df, cols, target_unit='GB') >>> df_converted A B C 0 5.000000 3.000000 1.000000e+01 1 0.000477 0.000488 2.384186e-04 2 2048.000000 0.000191 1.048576e+06 Example 2: Convert data values to MB in place, modifying the existing df, and show a summary of the changes: >>> convert_data_values(df, cols, target_unit='MB', inplace=True, ... show_results=True, decimal=8) Original: 5 gb -> Converted: 5120.00000000 MB Original: 500 kb -> Converted: 0.48828125 MB Original: 2tb -> Converted: 2097152.00000000 MB Original: 3GB -> Converted: 3072.00000000 MB Original: 0.5 MB -> Converted: 0.50000000 MB Original: 200KB -> Converted: 0.19531250 MB Original: 10 Gigabytes -> Converted: 10240.00000000 MB Original: 250 kilo -> Converted: 0.24414062 MB Original: 1 PB -> Converted: 1073741824.00000000 MB >>> df A B C 0 5.120000e+03 3072.000000 1.024000e+04 1 4.882812e-01 0.500000 2.441406e-01 2 2.097152e+06 0.195312 1.073742e+09 """ # Default conversion factors based on powers of 2 default_unit_conversion = { 'B': 0, 'BYTE': 0, 'BYTES': 0, 'KB': 10, 'KILOBYTE': 10, 'KILOBYTES': 10, 'KILO': 10, 'MB': 20, 'MEGABYTE': 20, 'MEGABYTES': 20, 'MEGA': 20, 'GB': 30, 'GIGABYTE': 30, 'GIGABYTES': 30, 'GIGA': 30, 'TB': 40, 'TERABYTE': 40, 'TERABYTES': 40, 'TERA': 40, 'PB': 50, 'PETABYTE': 50, 'PETABYTES': 50, 'PETA': 50, 'EB': 60, 'EXABYTE': 60, 'EXABYTES': 60, 'EXA': 60 } # Use the provided conversion_dict if passed as a parameter unit_conversion = conversion_dict if conversion_dict is not None else default_unit_conversion # Convert target_unit to uppercase for case-insensitive comparison target_unit = target_unit.upper() # Copy dataframe if not modifying in place if not inplace: df = df.copy() # Iterate through the columns for col in cols: # Function to convert a data value def convert_value(value): if pd.isna(value): # Handle NaNs if show_results: print(f"Original: NaN -> Converted: NaN") return np.nan # RegEx handles both with spaces or without, ex: '10 GB' and '10GB' match = re.match(r'(\d+\.?\d*)\s?([A-Za-z]+)', value, re.IGNORECASE) if not match: raise ValueError(f"Invalid format for value: {value}") # Assign the 2 groups from the RegEx matches number, unit = match.groups() unit = unit.upper() # Convert the unit to uppercase number = float(number) # Convert the values to floats # Convert the number to bytes bytes_value = number * (2 ** unit_conversion[unit]) # Convert the bytes value to the target unit converted_value = bytes_value / (2 ** unit_conversion[target_unit]) # Print the before/after values if show_results is True if show_results: print(f"Original: {value} -> Converted: {converted_value:.{decimal}f} {target_unit}") return converted_value # Apply the conversion function to values in each column, convert to float df[col] = df[col].apply(convert_value).astype(float) # Return a dataframe only if not modifying in place if inplace: return None else: return df
[docs] def convert_dtypes( df: pd.DataFrame, cols: List[str], target_dtype: Union[type, str], show_results: bool = False, inplace: bool = True ) -> Optional[pd.DataFrame]: """ Convert specified columns in a DataFrame to the desired data type. This function converts the data type of the specified columns in the input DataFrame to the desired target data type. It supports both base Python data types (e.g., int, float, str) and Pandas-specific data types (e.g., 'int64', 'float64', 'object', 'bool', 'datetime64', 'timedelta[ns]', 'category'). If `inplace` is set to True (default), the conversion is done in place, modifying the original DataFrame. If `inplace` is False, a new DataFrame with the converted columns is returned. If `show_results` is set to True, it will print the results of each successful conversion and any error messages for columns that could not be converted. Use this function when you need to convert the data types of specific columns in a DataFrame to a consistent target data type, especially when dealing with multiple columns at once and identifying columns that require further data cleaning. Parameters ---------- df : pd.DataFrame The DataFrame containing the columns to be converted. cols : List[str] List of column names in `df` to be converted. target_dtype : type or str The desired data type for the conversion. Can be a base Python data type (e.g., int, float, str) or a string representation of a Pandas data type (e.g., 'int64', 'float64', 'object', 'bool', 'datetime64', 'timedelta[ns]', 'category'). show_results : bool, optional If True, will print the results of each successful conversion and any error messages for columns that could not be converted. Default is False. inplace : bool, optional If True (default), the conversion is done in place, modifying the original DataFrame. If False, a new DataFrame with the converted columns is returned. Returns ------- pd.DataFrame or None If `inplace` is False, returns a new DataFrame with the specified columns converted to the target data type. If `inplace` is True, returns None as the original DataFrame is modified in place. Examples -------- Prepare data for examples: >>> df = pd.DataFrame({ ... 'A': [1, 2, 3], ... 'B': ['0', '23.4 MB', '3.71 GB'], ... 'C': ['4.5', '3.7', '12.15'], ... 'D': [True, False, True], ... 'E': ['Yes', 'No', ''], ... 'F': ['Low', 'Medium', 'High'] ... }) >>> num_columns = ['A', 'B', 'C'] >>> cat_columns = ['D', 'E', 'F'] >>> df.dtypes A int64 B object C object D bool E object F object dtype: object Example 1: Convert columns 'A', 'B', and 'C' to float and show the results: >>> convert_dtypes(df, num_columns, 'float') Error converting column: B (Current dtype: object). Error message: could not convert string to float: '23.4 MB' >>> df.dtypes A float64 B object C float64 D bool E object F object dtype: object Example 2: Convert columns 'D', 'E', and 'F' to category, show the results, and return a new DataFrame: >>> new_df = convert_dtypes(df, cat_columns, 'category', inplace=False, ... show_results=True) Successfully converted column 'D' from bool to category. Successfully converted column 'E' from object to category. Successfully converted column 'F' from object to category. >>> new_df.dtypes A float64 B object C float64 D category E category F category dtype: object """ if not inplace: df = df.copy() for col in cols: try: current_dtype = df[col].dtype if isinstance(target_dtype, str): df[col] = df[col].astype(target_dtype) else: df[col] = df[col].astype(target_dtype) if show_results: print(f"Successfully converted column '{col}' from {current_dtype} to {df[col].dtype}.") except (ValueError, TypeError) as e: print(f"Error converting column: {col} (Current dtype: {current_dtype}). Error message: {e}") if not inplace: return df
[docs] def convert_time_values( df: pd.DataFrame, cols: List[str], target_format: str = '%Y-%m-%d %H:%M:%S', target_dtype: Union[str, type, np.dtype] = 'datetime64[ns]', show_results: bool = False, inplace: bool = False, zero_to_nan: bool = False, pattern_list: Optional[List[str]] = None ) -> Optional[pd.DataFrame]: """ Convert time values in columns of a DataFrame to a target format and data type. This function converts time values in the specified columns of the input DataFrame to the desired target format and data type. If `inplace` is set to True, the conversion is done in place, modifying the original DataFrame. If `inplace` is False (default), a new DataFrame with the converted values is returned. The function can handle time values in various formats, including: 1. Excel serial format (e.g., '45161.23458') 2. String format (e.g., 'YYYY-MM-DD') 3. UNIX epoch in milliseconds (e.g., '1640304000000.0') If your format is not supported, you can define `pattern_list` as a list of custom datetime patterns. If `zero_to_nan` is set to True, values of '0', '0.0', '0.00', 0, 0.0, or 0.00 will be replaced with NaN. Otherwise, zero values will be detected as a Unix Epoch format with value 1970-01-01 00:00:00. You can use the default `target_format` of '%Y-%m-%d %H:%M:%S', or specify a different format. The `target_dtype` parameter allows you to specify the desired data type for the converted columns. When working with formatted date strings, the applicable data types are 'str', 'object', or 'datetime64[ns]'. - If `target_dtype` is set to 'str' or 'object', the converted data will be stored as strings in the specified `target_format`. Missing values will be represented as 'NaN'. - If `target_dtype` is set to 'datetime64[ns]', the converted data will be stored as pandas datetime objects. The `target_format` has no effect in this scenario. Missing values will be represented as 'NaT'. To display a summary of the changes made, set `show_results` to True. Parameters ---------- df : pd.DataFrame The DataFrame containing the columns to be converted. cols : List[str] List of column names in `df` to apply the conversion. target_format : str, optional The desired datetime format for the conversion. Uses format codes such as: %Y: 4-digit year, %m: Month as zero-padded decimal, %d: Day of the month, %H: Hour (24-hour clock), %M: Minute, %S: Second. Default format is '%Y-%m-%d %H:%M:%S'. target_dtype : Union[str, type, np.dtype], optional The desired data type for the converted columns. Applicable data types are 'str', 'object', or 'datetime64[ns]'. Default is 'datetime64[ns]'. show_results : bool, optional If set to True, prints the before-and-after conversion values for each cell. Default is False. inplace : bool, optional If set to True, the conversion is done in place, modifying the original DataFrame. If False (default), a new DataFrame with the converted values is returned. zero_to_nan : bool, optional If set to True, values of '0', '0.0', '0.00', 0, 0.0, or 0.00 will be replaced with NaN or NaT, depending on `target_dtype`. Default is False. pattern_list : List[str], optional A list of custom datetime patterns to override the default patterns. If provided, it will be used instead of the default patterns. Default is None. Returns ------- pd.DataFrame or None If `inplace` is False (default), returns a new DataFrame with the specified columns converted to the target format and data type. If `inplace` is True, returns None as the original DataFrame is modified in place. Examples -------- Prepare some sloppy data for the examples: >>> df = pd.DataFrame({ ... 'A': ['45161.23458', '2019-02-09', '1640304000000.0'], ... 'B': ['2022-01-01', '45000.5', '1577836800000.0'], ... 'C': ['0', '45161.23458', '2019-02-09'] ... }) >>> cols = ['A', 'B', 'C'] Example 1: Convert time values in specified columns to the default format and data type: >>> df_converted = convert_time_values(df, cols) >>> df_converted A B C 0 2023-08-25 05:37:47 2022-01-01 00:00:00 1970-01-01 00:00:00 1 2019-02-09 00:00:00 2023-03-17 12:00:00 2023-08-25 05:37:47 2 2021-12-24 00:00:00 2020-01-01 00:00:00 2019-02-09 00:00:00 >>> df_converted.dtypes A datetime64[ns] B datetime64[ns] C datetime64[ns] dtype: object Example 2: Convert time values in specified columns to the default format and data type, showing a summary of changes: >>> df_converted = convert_time_values(df, cols, show_results=True) Original: 45161.23458 (Excel Serial) -> Converted: 2023-08-25 05:37:47 Original: 2019-02-09 (Standard Datetime String) -> Converted: 2019-02-09 00:00:00 Original: 1640304000000.0 (UNIX Epoch in milliseconds) -> Converted: 2021-12-24 00:00:00 Original: 2022-01-01 (Standard Datetime String) -> Converted: 2022-01-01 00:00:00 Original: 45000.5 (Excel Serial) -> Converted: 2023-03-17 12:00:00 Original: 1577836800000.0 (UNIX Epoch in milliseconds) -> Converted: 2020-01-01 00:00:00 Original: 0 (UNIX Epoch) -> Converted: 1970-01-01 00:00:00 Original: 45161.23458 (Excel Serial) -> Converted: 2023-08-25 05:37:47 Original: 2019-02-09 (Standard Datetime String) -> Converted: 2019-02-09 00:00:00 >>> df_converted A B C 0 2023-08-25 05:37:47 2022-01-01 00:00:00 1970-01-01 00:00:00 1 2019-02-09 00:00:00 2023-03-17 12:00:00 2023-08-25 05:37:47 2 2021-12-24 00:00:00 2020-01-01 00:00:00 2019-02-09 00:00:00 Example 3: Convert time values to a custom format and data type in place, replacing zeros with NaN, and showing a summary of changes: >>> convert_time_values(df, cols, target_format='%d/%m/%Y', target_dtype='str', ... inplace=True, show_results=True, zero_to_nan=True) Original: 45161.23458 (Excel Serial) -> Converted: 25/08/2023 Original: 2019-02-09 (Standard Datetime String) -> Converted: 09/02/2019 Original: 1640304000000.0 (UNIX Epoch in milliseconds) -> Converted: 24/12/2021 Original: 2022-01-01 (Standard Datetime String) -> Converted: 01/01/2022 Original: 45000.5 (Excel Serial) -> Converted: 17/03/2023 Original: 1577836800000.0 (UNIX Epoch in milliseconds) -> Converted: 01/01/2020 Original: 0 (Zero) -> Converted: NaN Original: 45161.23458 (Excel Serial) -> Converted: 25/08/2023 Original: 2019-02-09 (Standard Datetime String) -> Converted: 09/02/2019 >>> df A B C 0 25/08/2023 01/01/2022 NaN 1 09/02/2019 17/03/2023 25/08/2023 2 24/12/2021 01/01/2020 09/02/2019 >>> df.dtypes A object B object C object dtype: object """ # Default datetime patterns default_patterns = [ r"^\d{4}-\d{2}-\d{2}$", # YYYY-MM-DD r"^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}$", # YYYY-MM-DD HH:MM:SS r"^\d{2}/\d{2}/\d{4}$", # MM/DD/YYYY ] # Use the provided pattern_list if passed as a parameter patterns = pattern_list if pattern_list is not None else default_patterns # Set zero patterns to look for zero_patterns = ['0', '0.0', '0.00', 0, 0.0, 0.00] # Copy dataframe if not modifying in place if not inplace: df = df.copy() # Iterate through the columns for col in cols: # Function to convert a time value def convert_value(value): # If value is NaN or NaT, return the appropriate missing value based on target_dtype if pd.isna(value) or value is pd.NaT: if target_dtype == 'datetime64[ns]': converted_value = pd.NaT else: converted_value = 'NaN' if show_results: print(f"Original: {value} -> Converted: {converted_value}") return converted_value # If zero_to_nan is True and value matches zero patterns, return the appropriate missing value based on target_dtype if zero_to_nan and str(value) in zero_patterns: if target_dtype == 'datetime64[ns]': converted_value = pd.NaT else: converted_value = 'NaN' if show_results: print(f"Original: {value} (Zero) -> Converted: {converted_value}") return converted_value detected_format = None try: # Convert Excel Serial to datetime if isinstance(value, (float, int)) or (isinstance(value, str) and "." in value): float_value = float(value) if 40000 < float_value < 50000: # Typical range for recent Excel serials datetime_val = pd.Timestamp('1900-01-01') + pd.to_timedelta(float_value, 'D') detected_format = "Excel Serial" else: datetime_val = pd.to_datetime(float(value), unit='ms') detected_format = "UNIX Epoch in milliseconds" # If value is '0', convert it to Unix epoch elif str(value) in zero_patterns: datetime_val = pd.to_datetime(0, unit='s') detected_format = "UNIX Epoch" # Assume it's already in a recognizable format (like 'YYYY-MM-DD') elif isinstance(value, str): if any(re.match(pattern, value) for pattern in patterns): datetime_val = pd.to_datetime(value) detected_format = "Standard Datetime String" else: raise ValueError(f"Unrecognized format for value: {value}") # Format conversion using the specified target_format formatted_datetime = datetime_val.strftime(target_format) except Exception as e: raise ValueError(f"Error converting value: {value}. Additional info: {e}") if show_results: print(f"Original: {value} ({detected_format}) -> Converted: {formatted_datetime}") return formatted_datetime # Apply the conversion function to values in each column df[col] = df[col].apply(convert_value) # Convert the modified columns to the specified data type for col in cols: df[col] = df[col].astype(target_dtype) # Return a dataframe only if not modifying in place if inplace: return None else: return df
[docs] def reduce_multicollinearity( df: DataFrame, target_col: str, corr_threshold: float = 0.9, consider_nan: bool = False, consider_zero: bool = False, diff_threshold: float = 0.1, decimal: int = 2 ) -> DataFrame: """ Reduce multicollinearity in a DataFrame by removing highly correlated features. This function iteratively evaluates pairs of features in a DataFrame based on their correlation to each other and to a specified target column. If two features are highly correlated (above `corr_threshold`), the one with the lower correlation to the target column is removed. The number of NaN and/or zero values can also be considered (prefering removal of features with more) by setting `consider_nan` or `consider_zero` to True. The threshold for significant differences (`diff_threshold`) can also be adjusted. Sometimes it might appear as if the correlations are the same, but it says one is greater. Adjust `decimal` to a larger number to see more precision in the correlation. Use this function to remove redundant features, and reduce a large feature set to a smaller one that contains the features most correlated with the target. This should improve the model's ability to learn from the dataset, improve performance, and increase interpretability of results. Parameters ---------- df : DataFrame The DataFrame to evaluate. It should have features and the target column. target_col : str The name of the target column against which feature correlations are evaluated. corr_threshold : float, optional The correlation threshold above which a pair of features is considered highly correlated. Default is 0.9. consider_nan : bool, optional If True, considers the number of NaN values in the decision process. Default is False. consider_zero : bool, optional If True, considers the number of zero values in the decision process. Default is False. diff_threshold : float, optional The threshold for considering the difference in NaN/zero counts as significant. Default is 0.1. decimal : int, optional The number of decimals to round to when displaying output. Default is 2. Returns ------- DataFrame A modified DataFrame with reduced multicollinearity, containing only the features that were kept after evaluation. Examples -------- Prepare the data for the examples: >>> np.random.seed(0) # For reproducibility >>> A = np.random.rand(100) >>> B = A + np.random.normal(0, 0.01, 100) # B is highly correlated with A >>> C = np.random.rand(100) >>> D = B + np.random.rand(100) # D is highly correlated with B >>> Target = A * 0.1 + C * 0.5 + D * 0.8 + np.random.normal(0, 0.1, 100) >>> df = pd.DataFrame({'A': A, 'B': B, 'C': C, 'D': D, 'Target': Target}) Example 1: Reduce multicollinearity in a DataFrame with correlated features and a target column: >>> reduced_df = reduce_multicollinearity(df, 'Target', decimal=4) Evaluating pair: 'B' and 'A' (1.0) - 5 kept features - Correlation with target: 0.6509, 0.6468 - Keeping 'B' (higher correlation, lower or equal count) <BLANKLINE> Example 2: Reduce multicollinearity with a lower correlation threshold and considering NaN and Zero values: >>> reduced_df = reduce_multicollinearity(df, 'Target', corr_threshold=0.6, ... consider_zero=True, consider_nan=True, diff_threshold=0.2) Evaluating pair: 'D' and 'A' (0.66) - 5 kept features - Correlation with target: 0.86, 0.65 - NaN/0 counts: 0, 0 - Keeping 'D' (higher correlation, lower or equal count) <BLANKLINE> Evaluating pair: 'D' and 'B' (0.66) - 4 kept features - Correlation with target: 0.86, 0.65 - NaN/0 counts: 0, 0 - Keeping 'D' (higher correlation, lower or equal count) <BLANKLINE> """ # Initial set up original_features = set(df.columns) kept_features = original_features.copy() # Initialize consider_text if consider_nan and consider_zero: consider_text = 'NaN/0' elif consider_nan: consider_text = 'NaN' elif consider_zero: consider_text = '0' else: consider_text = 'None' while True: kept_features_list = list(kept_features) corr_matrix = df[kept_features_list].corr() target_correlations = corr_matrix[target_col].abs().sort_values(ascending=False) changes_made = False evaluated_pairs = set() for feature in target_correlations.index.drop(target_col): if feature not in kept_features: continue # Skip if feature already removed feature_removed = False # Initialize flag to track if 'feature' is removed high_corr_features = corr_matrix[feature][corr_matrix[feature].abs() >= corr_threshold].index.drop(feature) high_corr_features = high_corr_features.difference(evaluated_pairs) high_corr_features = sorted(high_corr_features) for other_feature in high_corr_features: if other_feature not in kept_features or other_feature == target_col: continue # Skip if other feature already removed or is target evaluated_pairs.add((feature, other_feature)) # Mark this pair as evaluated evaluated_pairs.add((other_feature, feature)) # Experimental: does this help? feature_pair_corr = corr_matrix.at[feature, other_feature] corr_with_target_feature = target_correlations[feature] corr_with_target_other = target_correlations[other_feature] nan_count_feature = df[feature].isna().sum() if consider_nan else 0 zero_count_feature = (df[feature] == 0).sum() if consider_zero else 0 nan_count_other = df[other_feature].isna().sum() if consider_nan else 0 zero_count_other = (df[other_feature] == 0).sum() if consider_zero else 0 total_count_feature = nan_count_feature + zero_count_feature total_count_other = nan_count_other + zero_count_other # Evaluate if significant difference in counts max_count = max(total_count_feature, total_count_other) if max_count == 0: # Avoid division by zero sig_diff = False else: relative_difference = abs(total_count_feature - total_count_other) / max_count sig_diff = relative_difference > diff_threshold # Print decision process print(f"Evaluating pair: '{feature}' and '{other_feature}' ({round(feature_pair_corr, 2)}) - {len(kept_features)} kept features") print(f" - Correlation with target: {corr_with_target_feature:.{decimal}f}, {corr_with_target_other:.{decimal}f}") if consider_nan or consider_zero: print(f" - {consider_text} counts: {total_count_feature}, {total_count_other}") # Logic to decide which feature to keep if corr_with_target_feature > corr_with_target_other and total_count_feature <= total_count_other: print(f" - Keeping '{feature}' (higher correlation, lower or equal count)\n") if other_feature in kept_features: kept_features.remove(other_feature) changes_made = True else: print(f"{other_feature} already removed\n") elif corr_with_target_feature > corr_with_target_other and not sig_diff: print(f" - Keeping '{feature}' (higher correlation, no significant diff: {relative_difference:.{decimal}f} <= {diff_threshold:.{decimal}f})\n") if other_feature in kept_features: kept_features.remove(other_feature) changes_made = True else: print(f"{other_feature} already removed\n") elif corr_with_target_feature > corr_with_target_other and sig_diff: print(f" - Keeping '{other_feature}' (higher correlation, significant diff: {relative_difference:.{decimal}f} > {diff_threshold:.{decimal}f})\n") if feature in kept_features: kept_features.remove(feature) changes_made = True feature_removed = True else: print(f"{feature} already removed\n") elif corr_with_target_feature == corr_with_target_other and total_count_feature <= total_count_other: print(f" - Keeping '{feature}' (equal correlation, lower or equal count)\n") if other_feature in kept_features: kept_features.remove(other_feature) changes_made = True else: print(f"{other_feature} already removed\n") elif corr_with_target_feature == corr_with_target_other and total_count_feature > total_count_other: print(f" - Keeping '{other_feature}' (equal correlation, higher count)\n") if feature in kept_features: kept_features.remove(feature) changes_made = True feature_removed = True else: print(f"{feature} already removed\n") elif corr_with_target_feature < corr_with_target_other and total_count_feature == total_count_other: print(f" - Keeping '{other_feature}' (lower correlation, equal count)\n") if feature in kept_features: kept_features.remove(feature) changes_made = True feature_removed = True else: print(f"{feature} already removed\n") elif corr_with_target_feature < corr_with_target_other and not sig_diff: print(f" - Keeping '{other_feature}' (lower correlation, no significant diff: {relative_difference:.{decimal}f} <= {diff_threshold:.{decimal}f})\n") if feature in kept_features: kept_features.remove(feature) changes_made = True feature_removed = True else: print(f"{feature} already removed\n") else: print(f" - Keeping '{feature}' (otherwise, go with first feature)\n") if other_feature in kept_features: kept_features.remove(other_feature) changes_made = True else: print(f"{other_feature} already removed\n") if feature_removed: break # Break out of the first for loop if the feature was removed if not changes_made: # No more features to evaluate, end the loop break # Prepare final dataframe with kept features to_drop = original_features - kept_features reduced_df = df.drop(columns=to_drop) return reduced_df
[docs] def split_outliers( df: pd.DataFrame, columns: Optional[List[str]] = None, iqr_multiplier: float = 1.5 ) -> Tuple[pd.DataFrame, pd.DataFrame]: """ Split a DataFrame into two based on the presence of outliers. This function identifies outliers in the specified columns of the input DataFrame using the Interquartile Range (IQR) method. It then splits the DataFrame into two: one containing rows without outliers and another containing only the rows with outliers. Use this function when you need to separate outliers from the main data for further analysis or processing. Parameters ---------- df : pd.DataFrame The input DataFrame containing the data to be split. columns : List[str], optional List of column names to consider for outlier detection. If None, all columns in the DataFrame will be considered. Default is None. iqr_multiplier : float, optional The multiplier for the IQR range to determine outliers. Default is 1.5. Returns ------- Tuple[pd.DataFrame, pd.DataFrame] A tuple containing two DataFrames: The first DataFrame (df_no_outliers) contains rows without outliers. The second DataFrame (df_outliers) contains only the rows with outliers. Examples -------- Prepare the data for the examples: >>> df = pd.DataFrame({ ... 'A': [1, 2, 3, 4, 5, 100], ... 'B': [10, 20, 30, 40, 50, 600], ... 'C': [-30, 4, 3, 2, 1, 3] ... }) Example 1: Split outliers considering all columns: >>> df_no_outliers, df_outliers = split_outliers(df) >>> df_no_outliers A B C 1 2 20 4 2 3 30 3 3 4 40 2 4 5 50 1 >>> df_outliers A B C 0 1 10 -30 5 100 600 3 Example 2: Split outliers considering specific columns: >>> df_no_outliers, df_outliers = split_outliers(df, columns=['A', 'B']) >>> df_no_outliers A B C 0 1 10 -30 1 2 20 4 2 3 30 3 3 4 40 2 4 5 50 1 >>> df_outliers A B C 5 100 600 3 """ # If columns parameter is not provided, use all columns in the DataFrame if columns is None: columns = df.columns # Create an initial mask with all False values (meaning no outliers) outlier_mask = pd.Series(False, index=df.index) # For each specified column, update the outlier mask to mark outliers for col in columns: Q1 = df[col].quantile(0.25) Q3 = df[col].quantile(0.75) IQR = Q3 - Q1 # Update mask for outliers in the current column outlier_mask |= (df[col] < (Q1 - iqr_multiplier * IQR)) | \ (df[col] > (Q3 + iqr_multiplier * IQR)) # Use the mask to split the data df_no_outliers = df[~outlier_mask] df_outliers = df[outlier_mask] return df_no_outliers, df_outliers