# tools.py – Tools module of Datawaza
#
# Datawaza Copyright (C) 2024 Jim Beno
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details:
# https://github.com/jbeno/datawaza/blob/main/LICENSE
"""
This module provides helper tools used in data analysis, cleaning, and modeling.
It contains functions to check for duplicates in lists, split a dataframe into two
by numeric vs. categorical variables, format numbers on the axis of a chart,
perform log transformations, calculate VIF and Feature Permutation Importance,
and extract the coefficients from models that support them.
Classes:
- :class:`~datawaza.tools.DebugPrinter` - Conditionally print debugging information during the execution of a script.
- :meth:`~datawaza.tools.DebugPrinter.print` - Print a message if debugging is enabled.
- :meth:`~datawaza.tools.DebugPrinter.set_debug` - Enable or disable debugging mode.
- :class:`~datawaza.tools.LogTransformer` - Apply logarithmic transformation to numerical features.
- :meth:`~datawaza.tools.LogTransformer.fit` - Fit the transformer to the input data.
- :meth:`~datawaza.tools.LogTransformer.transform` - Apply the logarithmic transformation to the input data.
- :meth:`~datawaza.tools.LogTransformer.get_feature_names_out` - Get the feature names after applying the transformation.
Functions:
- :func:`~datawaza.tools.calc_pfi` - Calculate Permutation Feature Importance for a trained model.
- :func:`~datawaza.tools.calc_vif` - Calculate the Variance Inflation Factor (VIF) for each feature.
- :func:`~datawaza.tools.check_for_duplicates` - Check for duplicate items (ex: column names) across multiple lists.
- :func:`~datawaza.tools.extract_coef` - Extract feature names and coefficients from a trained model.
- :func:`~datawaza.tools.format_df` - Format columns of a DataFrame as either large or small numbers.
- :func:`~datawaza.tools.log_transform` - Apply a log transformation to specified columns in a DataFrame.
- :func:`~datawaza.tools.model_summary` - Create a DataFrame summary of a Keras model's architecture and parameters.
- :func:`~datawaza.tools.split_dataframe` - Split a DataFrame into categorical and numerical columns.
- :func:`~datawaza.tools.dollars` - Format a number as currency with thousands separators on a matplotlib chart axis.
- :func:`~datawaza.tools.thousands` - Format a number with thousands separators on a matplotlib chart axis.
"""
# Metadata
__author__ = "Jim Beno"
__email__ = "jim@jimbeno.net"
__version__ = "0.1.3"
__license__ = "GNU GPLv3"
# Standard library imports
import os
import inspect
# Data manipulation and analysis
import numpy as np
import pandas as pd
# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
# Machine Learning: Model selection and evaluation
from sklearn.model_selection import GridSearchCV
from sklearn.inspection import permutation_importance
# Machine Learning: Pipeline and transformations
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
# Machine Learning: Models
from sklearn.linear_model import (
LogisticRegression, LogisticRegressionCV, PassiveAggressiveClassifier,
Perceptron, RidgeClassifier, RidgeClassifierCV, SGDClassifier, SGDOneClassSVM,
LinearRegression, Ridge, RidgeCV, SGDRegressor, ElasticNet, ElasticNetCV,
Lars, LarsCV, Lasso, LassoCV, LassoLars, LassoLarsCV, LassoLarsIC,
OrthogonalMatchingPursuit, OrthogonalMatchingPursuitCV, ARDRegression,
BayesianRidge, HuberRegressor, QuantileRegressor, RANSACRegressor,
TheilSenRegressor
)
# Typing imports
from typing import Optional, Union, Tuple, List, Dict, Any
# TensorFlow and Keras
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' # Suppress TensorFlow warning on import
import tensorflow as tf
import keras as keras
# Functions
[docs]
def calc_pfi(model,
             X: pd.DataFrame,
             y: pd.Series,
             scoring: Any = None,
             n_repeats: int = 10,
             random_state: int = 42,
             decimal: int = 2
             ) -> pd.DataFrame:
    """
    Rank the features of a trained model by Permutation Feature Importance.

    This is a convenience wrapper around scikit-learn's
    `permutation_importance`. Each feature is shuffled `n_repeats` times and
    the resulting change in the model's score is recorded. The mean and
    standard deviation of those changes are collected into a DataFrame, sorted
    from most to least important, and passed through `format_df` so the two
    importance columns are displayed with `decimal` decimal places.

    Parameters
    ----------
    model :
        The trained model object, forwarded unchanged to
        `permutation_importance`.
    X : pd.DataFrame
        The features used for prediction. Its column names become the
        'Feature' column of the result.
    y : pd.Series
        The target values corresponding to `X`.
    scoring : Any, optional
        Scorer forwarded to `permutation_importance` (a scoring string or a
        callable). Default is None, which uses the estimator's default scorer.
    n_repeats : int, optional
        Number of times each feature is permuted. Default is 10.
    random_state : int, optional
        Seed forwarded to `permutation_importance` for reproducibility.
        Default is 42.
    decimal : int, optional
        Number of decimals passed to `format_df` for display. Default is 2.

    Returns
    -------
    pd.DataFrame
        A DataFrame with the columns 'Feature', 'Importance Mean' and
        'Importance Std', sorted in descending order of 'Importance Mean'.

    Examples
    --------
    Prepare a sample dataset and train a model:

    >>> from sklearn.datasets import load_iris
    >>> from sklearn.ensemble import RandomForestClassifier
    >>> iris = load_iris()
    >>> X = pd.DataFrame(iris.data, columns=iris.feature_names)
    >>> y = pd.Series(iris.target)
    >>> model = RandomForestClassifier(random_state=42)
    >>> model.fit(X, y)
    RandomForestClassifier(random_state=42)

    Calculate Permutation Feature Importance:

    >>> pfi_df = calc_pfi(model, X, y, decimal=4)
    >>> pfi_df  # doctest: +NORMALIZE_WHITESPACE
                 Feature Importance Mean Importance Std
    2  petal length (cm)          0.2227         0.0243
    3   petal width (cm)          0.1807         0.0212
    0  sepal length (cm)          0.0147         0.0065
    1   sepal width (cm)          0.0127         0.0047
    """
    # Delegate the shuffling and re-scoring to scikit-learn
    result = permutation_importance(
        model, X, y,
        n_repeats=n_repeats,
        scoring=scoring,
        random_state=random_state,
    )

    # One row per feature, in the column order of X
    importance_table = pd.DataFrame({
        "Feature": X.columns,
        "Importance Mean": result.importances_mean,
        "Importance Std": result.importances_std,
    })

    # Most important features first
    ranked = importance_table.sort_values(by="Importance Mean", ascending=False)

    # Apply the display formatting to the two numeric columns
    return format_df(ranked,
                     small_num_cols=['Importance Mean', 'Importance Std'],
                     decimal=decimal)
[docs]
def calc_vif(X: pd.DataFrame,
             num_columns: Optional[List[str]] = None,
             decimal: int = 2
             ) -> pd.DataFrame:
    """
    Calculate the Variance Inflation Factor (VIF) for each feature.

    For every selected column, a linear regression is fitted that predicts
    that column from all of the other selected columns. The resulting R^2 is
    turned into a VIF with ``1 / (1 - R^2)``. When ``1 - R^2`` is smaller than
    1e-5 the VIF is reported as 1000 instead of performing the division.

    Each VIF is also labelled with a multicollinearity level: "Extreme"
    (VIF >= 100), "High" (10 <= VIF < 100), "Moderate" (5 <= VIF < 10) or
    "Low" (VIF < 5). The result is sorted by VIF in descending order and
    passed through `format_df` so the 'VIF' column is displayed with
    `decimal` decimal places.

    Parameters
    ----------
    X : pd.DataFrame
        The input DataFrame containing the features to calculate VIF for.
    num_columns : List[str], optional
        Column names to include in the calculation. If None (default), all
        numeric columns of `X` are used.
    decimal : int, optional
        Number of decimals passed to `format_df` for display. Default is 2.

    Returns
    -------
    pd.DataFrame
        A DataFrame with the columns 'Features', 'VIF' and
        'Multicollinearity', sorted in descending order of VIF.

    Examples
    --------
    Prepare a sample dataset for the examples:

    >>> from sklearn.datasets import load_iris
    >>> iris = load_iris()
    >>> X = pd.DataFrame(iris.data, columns=iris.feature_names)
    >>> num_columns = list(X.columns)

    Example 1: Calculate VIF for all numeric features in the iris dataset:

    >>> vif_df = calc_vif(X)
    >>> vif_df  # doctest: +NORMALIZE_WHITESPACE
                Features    VIF Multicollinearity
    2  petal length (cm)  31.26              High
    3   petal width (cm)  16.09              High
    0  sepal length (cm)   7.07          Moderate
    1   sepal width (cm)   2.10               Low

    Example 2: Calculate VIF for specific numeric features, 4 decimals:

    >>> vif_df = calc_vif(X, num_columns=num_columns, decimal=4)
    >>> vif_df  # doctest: +NORMALIZE_WHITESPACE
                Features      VIF Multicollinearity
    2  petal length (cm)  31.2615              High
    3   petal width (cm)  16.0902              High
    0  sepal length (cm)   7.0727          Moderate
    1   sepal width (cm)   2.1009               Low
    """
    from sklearn.linear_model import LinearRegression

    # Value reported when a feature is (almost) perfectly explained by the others
    MAX_VIF = 1000
    # (lower bound, label) pairs, checked from most to least severe
    severity_levels = ((100, "Extreme"), (10, "High"), (5, "Moderate"))

    def interpret_vif(vif):
        for lower_bound, label in severity_levels:
            if vif >= lower_bound:
                return label
        return "Low"

    # Default to every numeric column when the caller gave no explicit list
    if num_columns is None:
        num_columns = X.select_dtypes(include=[np.number]).columns

    vif_by_feature = {}
    for target in num_columns:
        # Regress this feature on all of the remaining ones
        predictors = [name for name in num_columns if name != target]
        X_rest = X[predictors]
        y_target = X[target]
        regression = LinearRegression().fit(X_rest, y_target)
        unexplained = 1 - regression.score(X_rest, y_target)

        # Guard against dividing by (nearly) zero when R^2 is close to 1
        vif_by_feature[target] = MAX_VIF if unexplained < 1e-5 else 1 / unexplained

    # Assemble the table, label each row, and put the largest VIF first
    vif_table = pd.DataFrame({"Features": vif_by_feature.keys(),
                              "VIF": vif_by_feature.values()})
    vif_table["Multicollinearity"] = vif_table["VIF"].apply(interpret_vif)
    ranked = vif_table.sort_values(by='VIF', ascending=False)

    # Apply the display formatting to the VIF column
    return format_df(ranked, small_num_cols=['VIF'], decimal=decimal)
[docs]
def check_for_duplicates(*lists: List[str],
                         df: Optional[pd.DataFrame] = None) -> None:
    """
    Check for duplicate items (ex: column names) across multiple lists.

    This function takes an arbitrary number of lists and reports every item
    that appears in more than one list, or more than once within a single
    list, together with the names of the lists it was found in. If a
    DataFrame is provided, it also reports the DataFrame columns that do not
    appear in any of the lists.

    List names are discovered by looking for the list objects among the
    caller's local variables. A list that is not bound to a name there (for
    example a literal passed directly in the call) is reported as
    ``list_<position>``, where the position is its 1-based argument index.

    Use this function when you are organizing columns in a large DataFrame
    into lists that represent their variable type (ex: num_columns,
    cat_columns), to make sure no column is duplicated or left unassigned.

    Parameters
    ----------
    *lists : List[str]
        An arbitrary number of lists containing items (ex: column names) to
        check for duplicates.
    df : pd.DataFrame, optional
        A DataFrame to check for columns that are not present in the lists.
        Default is None.

    Returns
    -------
    None
        The function prints the duplicate items, the lists they appear in,
        and any missing DataFrame columns (if `df` is provided). Missing
        columns are printed in the order they occur in `df`.

    Examples
    --------
    Prepare data for examples, with intentional duplicates:

    >>> df = pd.DataFrame({'age': [], 'height': [], 'weight': [], 'gender': [],
    ...                    'city': [], 'country': []})
    >>> num_cols = ['age', 'height', 'weight']
    >>> cat_cols = ['gender', 'age', 'country', 'country']

    Example 1: Check for duplicate column names in two lists:

    >>> check_for_duplicates(num_cols, cat_cols)
    Items appearing in more than one list, or more than once per list:
    age (2): num_cols, cat_cols
    country (2): cat_cols, cat_cols

    Fix the duplicate column:

    >>> cat_cols = ['gender', 'country']

    Example 2: Check for duplicates, and look for missing columns in a DataFrame:

    >>> check_for_duplicates(num_cols, cat_cols, df=df)
    Items appearing in more than one list, or more than once per list:
    None.
    <BLANKLINE>
    Columns in the dataframe missing from the lists:
    city

    Fix the missing column:

    >>> cat_cols = ['gender', 'city', 'country']

    Final check:

    >>> check_for_duplicates(num_cols, cat_cols, df=df)
    Items appearing in more than one list, or more than once per list:
    None.
    <BLANKLINE>
    Columns in the dataframe missing from the lists:
    None.
    """
    # Snapshot the caller's local variables so list objects can be mapped back
    # to the names they are bound to. currentframe() may return None on
    # interpreters without frame support, in which case no names are known.
    frame = inspect.currentframe()
    caller = frame.f_back if frame is not None else None
    caller_locals = dict(caller.f_locals) if caller is not None else {}
    # Drop the frame references right away to avoid reference cycles
    del frame, caller

    # Map each item to the names of the lists it appears in. A name is added
    # once per occurrence, so repeats within a single list are detected too.
    item_sources: Dict[Any, List[str]] = {}
    for position, lst in enumerate(lists, start=1):
        # Identity comparison finds the variable bound to this exact object;
        # fall back to a positional label when there is none.
        list_name = next(
            (name for name, value in caller_locals.items() if value is lst),
            f"list_{position}",
        )
        for item in lst:
            item_sources.setdefault(item, []).append(list_name)

    # Keep only the items that were seen more than once
    duplicates = {item: names
                  for item, names in item_sources.items()
                  if len(names) > 1}

    # Print the summary of duplicate items
    print("Items appearing in more than one list, or more than once per list:")
    if duplicates:
        for item, names in duplicates.items():
            print(f"{item} ({len(names)}): {', '.join(names)}")
    else:
        print("None.")

    # If a DataFrame is passed, report its columns that are missing from the lists
    if df is not None:
        # dict.fromkeys de-duplicates while keeping the DataFrame's column order
        missing_columns = [column for column in dict.fromkeys(df.columns)
                           if column not in item_sources]
        print("\nColumns in the dataframe missing from the lists:")
        if missing_columns:
            for column in missing_columns:
                print(column)
        else:
            print("None.")
[docs]
def model_summary(
        model: keras.Model
) -> pd.DataFrame:
    """
    Create a DataFrame summary of a Keras model's architecture and parameters.

    The summary contains one row for the model itself, one row per input
    tensor, one row per layer, and three closing rows with the total,
    trainable and non-trainable parameter counts. Byte sizes are computed as
    4 bytes per parameter, i.e. they assume float32 parameters.

    If the model has not been built yet, a message is printed and an empty
    DataFrame is returned.

    Use this function when you need a structured summary of a Keras model for
    analysis, reporting, or for tests where the text produced by
    `model.summary()` varies too much to compare reliably.

    Parameters
    ----------
    model : keras.Model
        The Keras model for which to generate the summary.

    Returns
    -------
    pd.DataFrame
        A DataFrame with the columns 'Item', 'Name', 'Type', 'Activation',
        'Output Shape', 'Parameters' and 'Bytes'. It is empty when the model
        is not built.

    Examples
    --------
    >>> pd.set_option('display.max_columns', None)  # For test consistency
    >>> pd.set_option('display.width', None)  # For test consistency
    >>> model = keras.Sequential([
    ...     keras.layers.Input(shape=(10,), name='Input'),
    ...     keras.layers.Dense(64, activation='relu', name='Dense_1'),
    ...     keras.layers.Dense(32, activation='relu', name='Dense_2'),
    ...     keras.layers.Dense(1, activation='sigmoid', name='Dense_3'),
    ... ], name='Sequential_Model')
    >>> model.build()
    >>> model_summary(model)  #doctest: +NORMALIZE_WHITESPACE
            Item                  Name         Type Activation Output Shape  Parameters    Bytes
    0      Model      Sequential_Model   Sequential       None         None         NaN      NaN
    1      Input                 Input  KerasTensor       None   (None, 10)         0.0      0.0
    2      Layer               Dense_1        Dense       relu   (None, 64)       704.0   2816.0
    3      Layer               Dense_2        Dense       relu   (None, 32)      2080.0   8320.0
    4      Layer               Dense_3        Dense    sigmoid    (None, 1)        33.0    132.0
    5  Statistic          Total Params         None       None         None      2817.0  11268.0
    6  Statistic      Trainable Params         None       None         None      2817.0  11268.0
    7  Statistic  Non-Trainable Params         None       None         None         0.0      0.0
    """
    if not model.built:
        print("Model is not built. Please build the model by calling `model.build(input_shape)` or by running `model.fit()` with some data.")
        return pd.DataFrame()  # Return an empty DataFrame if the model is not built

    def format_size(num_params):
        return num_params * 4  # Assuming parameters are float32, each taking 4 bytes

    layers_summary = []

    # Model row
    layers_summary.append(["Model", model.name, model.__class__.__name__, None, None, None, None])

    # Input layer(s); tolerate a model that exposes no `inputs` (missing or None)
    for input_tensor in (getattr(model, 'inputs', None) or []):
        layers_summary.append([
            "Input", input_tensor.name.split(':')[0], input_tensor.__class__.__name__,
            None, str(input_tensor.shape), 0, 0
        ])

    # Layers
    for layer in model.layers:
        activation = getattr(layer, 'activation', None)
        if activation is None:
            activation_name = None
        else:
            # Plain functions carry __name__; callable objects used as
            # activations may not, so fall back to their class name instead of
            # raising AttributeError.
            activation_name = (getattr(activation, '__name__', None)
                               or type(activation).__name__)
        try:
            output_shape = str(layer.output.shape)
        except (AttributeError, ValueError):
            # The layer has no defined output to report.
            # NOTE(review): ValueError is caught defensively; confirm which
            # exception the installed Keras version raises here.
            output_shape = 'Unavailable'
        # Count once and reuse for both the parameter and byte columns
        layer_params = layer.count_params()
        layers_summary.append([
            "Layer", layer.name, layer.__class__.__name__, activation_name,
            output_shape, layer_params, format_size(layer_params)
        ])

    # Statistics
    total_params = model.count_params()
    trainable_params = sum(tf.size(w).numpy() for w in model.trainable_variables)
    non_trainable_params = total_params - trainable_params
    layers_summary.append(["Statistic", "Total Params", None, None, None, total_params, format_size(total_params)])
    layers_summary.append(["Statistic", "Trainable Params", None, None, None, trainable_params, format_size(trainable_params)])
    layers_summary.append(["Statistic", "Non-Trainable Params", None, None, None, non_trainable_params, format_size(non_trainable_params)])

    summary_df = pd.DataFrame(layers_summary, columns=["Item", "Name", "Type", "Activation", "Output Shape", "Parameters", "Bytes"])
    return summary_df
[docs]
def split_dataframe(
        df: pd.DataFrame,
        n: int
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Split a DataFrame into categorical and numerical columns.

    Columns are classified by their number of unique values, as counted by
    `Series.nunique()`. Columns with `n` or fewer unique values are treated
    as categorical and returned in `df_cat`; columns with more than `n`
    unique values are treated as numerical and returned in `df_num`.

    Columns are selected by position, so DataFrames with repeated column
    labels are handled. Both results are copies that keep the row index and
    the relative column order of `df`, even when no column qualifies for one
    of them.

    Use this function when you need to separate categorical and numerical
    columns in a DataFrame for further analysis or processing.

    Parameters
    ----------
    df : pd.DataFrame
        The DataFrame to split.
    n : int
        The maximum number of unique values for a column to be considered
        categorical.

    Returns
    -------
    Tuple[pd.DataFrame, pd.DataFrame]
        A tuple containing two DataFrames:
        - df_cat: Contains the categorical columns of `df`.
        - df_num: Contains the numerical columns of `df`.

    Examples
    --------
    Prepare the data for the examples:

    >>> data = {
    ...     'A': [5.1, 2.0, 3.2, 1.4, 7.2],
    ...     'B': ['Yes', 'No', 'No', 'Yes', 'No'],
    ...     'C': [10, 20, 30, 40, 50],
    ...     'D': ['High', 'Low', 'High', 'Low', 'Low']
    ... }
    >>> df = pd.DataFrame(data)

    Example 1: Split the DataFrame based on 2 unique values:

    >>> df_cat, df_num = split_dataframe(df, n=2)
    >>> df_cat  # doctest: +NORMALIZE_WHITESPACE
         B     D
    0  Yes  High
    1   No   Low
    2   No  High
    3  Yes   Low
    4   No   Low
    >>> df_num  # doctest: +NORMALIZE_WHITESPACE
         A   C
    0  5.1  10
    1  2.0  20
    2  3.2  30
    3  1.4  40
    4  7.2  50
    """
    # One flag per column position. Iterating with items() yields each column
    # as a Series even when labels repeat, which label-based df[col] does not.
    is_categorical = np.array(
        [series.nunique() <= n for _, series in df.items()],
        dtype=bool,
    )

    # Select each group in a single step instead of inserting column by
    # column, and copy so the results are independent of the input frame.
    df_cat = df.loc[:, is_categorical].copy()
    df_num = df.loc[:, ~is_categorical].copy()

    return df_cat, df_num
[docs]
def dollars(
        x: float,
        pos: int = 0
) -> str:
    """
    Format a number as currency with thousands separators on a matplotlib
    chart axis.

    The value is converted to an integer with `int()`, which discards the
    fractional part, and is then rendered with comma separators and a leading
    dollar sign. The signature matches what matplotlib's `FuncFormatter`
    expects: `pos` is accepted but not used.

    Use this function when you need to display currency values in a more
    readable format, particularly on matplotlib or seaborn axes.

    Parameters
    ----------
    x : float
        The number to format.
    pos : int, optional
        The tick position supplied by matplotlib. It is ignored by this
        function. Default is 0.

    Returns
    -------
    str
        The formatted number as a string with thousands separators and a
        dollar sign.

    Examples
    --------
    Example 1: Format a large currency value with default parameters:

    >>> x = 1234567.89
    >>> formatted_num = dollars(x)
    >>> print(formatted_num)
    $1,234,567

    Example 2: Use the function for tick formatting in a seaborn scatterplot:

    >>> import pandas as pd
    >>> import seaborn as sns
    >>> import matplotlib.pyplot as plt
    >>> from matplotlib.ticker import FuncFormatter
    >>> data = {
    ...     'total_rooms': [880.0, 7099.0, 1467.0, 1274.0, 1627.0, 919.0, 2535.0],
    ...     'median_house_value': [452600.0, 358500.0, 352100.0, 341300.0,
    ...                            342200.0, 269700.0, 299200.0]
    ... }
    >>> df = pd.DataFrame(data)
    >>> sns.scatterplot(data=df, x='total_rooms', y='median_house_value')  # doctest: +SKIP
    >>> plt.gca().yaxis.set_major_formatter(FuncFormatter(dollars))  # doctest: +SKIP
    >>> plt.show()  # doctest: +SKIP
    """
    # int() drops the fractional part before the digits are grouped
    whole_amount = int(x)
    return '${:0,d}'.format(whole_amount)
[docs]
def thousands(
        x: float,
        pos: int = 0
) -> str:
    """
    Format a number with thousands separators on a matplotlib chart axis.

    The value is converted to an integer with `int()`, which discards the
    fractional part, and is then rendered with comma separators. The
    signature matches what matplotlib's `FuncFormatter` expects: `pos` is
    accepted but not used.

    Use this function when you need to display large numbers in a more
    readable format, particularly on matplotlib or seaborn axes.

    Parameters
    ----------
    x : float
        The number to format.
    pos : int, optional
        The tick position supplied by matplotlib. It is ignored by this
        function. Default is 0.

    Returns
    -------
    str
        The formatted number as a string with thousands separators.

    Examples
    --------
    Example 1: Format a large number with default parameters:

    >>> x = 1234567.89
    >>> formatted_num = thousands(x)
    >>> print(formatted_num)
    1,234,567

    Example 2: Use the function for tick formatting in a seaborn scatterplot:

    >>> import pandas as pd
    >>> import seaborn as sns
    >>> import matplotlib.pyplot as plt
    >>> from matplotlib.ticker import FuncFormatter
    >>> data = {
    ...     'total_rooms': [880.0, 7099.0, 1467.0, 1274.0, 1627.0, 919.0, 2535.0],
    ...     'median_house_value': [452600.0, 358500.0, 352100.0, 341300.0,
    ...                            342200.0, 269700.0, 299200.0]
    ... }
    >>> df = pd.DataFrame(data)
    >>> sns.scatterplot(data=df, x='total_rooms', y='median_house_value')  # doctest: +SKIP
    >>> plt.gca().xaxis.set_major_formatter(FuncFormatter(thousands))  # doctest: +SKIP
    >>> plt.show()  # doctest: +SKIP
    """
    # int() drops the fractional part before the digits are grouped
    whole_number = int(x)
    return '{:0,d}'.format(whole_number)
# Classes
[docs]
class DebugPrinter:
    """
    Conditionally print debugging information during the execution of a script.

    An instance holds a single on/off switch, the `debug` attribute. Its
    `print()` method forwards to the built-in `print()` while the switch is
    on and does nothing while it is off, so debug messages can be left in a
    script and toggled in one place instead of being wrapped in
    "if debug:" statements.

    Parameters
    ----------
    debug : bool, optional
        Whether to enable debugging output. Default is False.

    Examples
    --------
    Set some test variables for the examples:

    >>> name = 'Setting'
    >>> value = 10

    Example 1: Create a DebugPrinter object and print a debug message:

    >>> db = DebugPrinter(debug=True)
    >>> db.print('This is a debug message.')
    This is a debug message.

    Example 2: Disable debugging and print a message that doesn't display:

    >>> db.set_debug(False)
    >>> db.print("This is a debug message that won't show.")

    Example 3: Re-enable debug, and print a formatted message with variables:

    >>> db.set_debug(True)
    >>> db.print(f'This is a debug message. ({name}: {value})')
    This is a debug message. (Setting: 10)
    """

    def __init__(
            self,
            debug: bool = False
    ):
        """
        Store the initial debugging setting on the instance.
        """
        self.debug = debug

    def print(self, *args, **kwargs):
        """
        Forward the arguments to the built-in `print()` when debugging is on.

        Parameters
        ----------
        *args
            Any number of positional arguments to print.
        **kwargs
            Any keyword arguments to pass to the built-in `print()` function.
        """
        # Nothing to do while debugging output is switched off
        if not self.debug:
            return
        print(*args, **kwargs)

    def set_debug(self, debug: bool):
        """
        Turn debugging output on or off.

        Parameters
        ----------
        debug : bool
            Whether to enable or disable debugging output.
        """
        self.debug = debug