Source code for graphpkg.static._general

"""
Plotting utility.

author : Nishant Baheti <nishantbaheti.it19@gmail.com>
"""

from typing import Callable, Union, Optional

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats
import seaborn as sns


def plot_distribution(x: np.ndarray, kde: Optional[bool] = True,
                      indicate_data: Optional[Union[list, np.ndarray]] = None,
                      figsize: Optional[tuple] = None) -> None:
    """
    Plot a distribution with additional information.

    Combines a box plot and a histogram/KDE plot using matplotlib and seaborn.

    Args:
        x (np.ndarray): input 1D array.
        kde (Optional[bool], optional): kde parameter from seaborn. Defaults to True.
        indicate_data (Optional[Union[list, np.ndarray]], optional): data points to
            observe/indicate in the plot. Defaults to None.
        figsize (Optional[tuple], optional): figure size from matplotlib. Defaults to None.

    Raises:
        AssertionError: only 1D arrays are allowed as input.

    Examples:
        >>> import numpy as np
        >>> import matplotlib.pyplot as plt
        >>> from graphpkg.static import plot_distribution
        >>> x = np.random.normal(size=(200,))
        >>> plot_distribution(x, indicate_data=[0.6])
        >>> plt.show()
    """
    x = np.array(x) if isinstance(x, (list, tuple)) else x
    assert len(x.shape) == 1, "only 1d arrays are allowed."

    min_value = x.min()
    max_value = x.max()
    mean_value = x.mean()
    std_value = x.std()
    median_value = np.median(x)
    mode_value = stats.mode(x)

    txt_summary = f"""
    Min     : {min_value}
    Max     : {max_value}
    Median  : {median_value}
    Mode    : {mode_value}
    Mean    : {mean_value}
    Std dev : {std_value}
    """
    print(txt_summary)

    _, _ax = plt.subplots(2, 1, figsize=figsize or (5, 5))

    # box plot with min/mean/median/max markers
    _ax[0].boxplot(x, vert=False)
    _ax[0].axvline(x=min_value, color='blue', lw=2, label='min')
    _ax[0].axvline(x=mean_value, color='k', lw=2, label='mean')
    _ax[0].axvline(x=median_value, color='red', lw=2, label='median')
    _ax[0].axvline(x=max_value, color='gray', lw=2, label='max')

    # histogram/KDE with the same markers plus 1 and 2 standard-deviation bands
    sns.histplot(x, kde=kde, label='distribution', ax=_ax[1], element='step')
    _ax[1].axvline(x=min_value, color='blue', lw=2, label='min')
    _ax[1].axvline(x=mean_value, color='k', lw=2, label='mean')
    _ax[1].axvline(x=median_value, color='red', lw=2, label='median')
    _ax[1].axvline(x=max_value, color='gray', lw=2, label='max')
    _ax[1].axvline(x=mean_value + std_value, color='gray', ls='--')
    _ax[1].axvline(x=mean_value - std_value, color='gray', ls='--')
    _ax[1].axvline(x=mean_value + (2 * std_value), color='gray', ls='--')
    _ax[1].axvline(x=mean_value - (2 * std_value), color='gray', ls='--')

    if indicate_data is not None:
        for ind_data in indicate_data:
            _ax[1].axvline(x=ind_data, color='k', alpha=0.4, lw=3)
            _ax[1].axvline(x=ind_data, color='k', lw=1, label=f"indicating {ind_data}")

    plt.tight_layout()
    plt.legend(loc='upper right')
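
# Illustrative usage sketch (my addition, not part of the original module): on
# skewed data the mean/median markers and the dashed 1- and 2-sigma bands make
# the asymmetry visible. The names below are hypothetical example values.
def _demo_plot_distribution():
    rng = np.random.default_rng(0)
    skewed = rng.exponential(scale=2.0, size=500)   # right-skewed sample
    plot_distribution(skewed, indicate_data=[5.0])  # mark an observation of interest
    plt.show()
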
def adjust_multiplots(n_plots: int, n_cols: int, figsize: Union[tuple, None]):
    """
    Adjust multiple plots in matplotlib subplots.

    Args:
        n_plots (int): number of plots.
        n_cols (int): number of columns.
        figsize (Union[tuple, None]): figure size.

    Returns:
        matplotlib figure, matplotlib subplot axes.

    Examples:
        >>> fig, ax = adjust_multiplots(n_plots=9, n_cols=3, figsize=(15, 15))
    """
    n_cols = min(n_cols, n_plots)
    n_rows = int(np.ceil(n_plots / n_cols))
    _fig, _ax = plt.subplots(ncols=n_cols, nrows=n_rows,
                             figsize=figsize or (n_cols * 3, n_rows * 3))
    # flatten the axes grid to a 1D array so callers can index it uniformly
    if isinstance(_ax, np.ndarray):
        _ax = _ax.reshape((n_rows * n_cols))
    else:
        _ax = np.array([_ax])
    return _fig, _ax


def create_mesh(size: int, pts_details: int) -> list:
    """
    Create a mesh grid.

    Args:
        size (int): size/extent of the canvas; points span [-size, size].
        pts_details (int): number of points along each axis.

    Returns:
        list: list of mesh variables, generally referred to as xx and yy.

    Examples:
        >>> xx, yy = create_mesh(size=4, pts_details=100)
    """
    loc_points = np.linspace(-size, size, pts_details)
    return np.meshgrid(loc_points, loc_points)


def create_canvas(size: int, pts_details: int) -> np.ndarray:
    """
    Create a 2-dimensional canvas.

    Args:
        size (int): size/extent of the canvas.
        pts_details (int): number of points along each axis.

    Returns:
        np.ndarray: numpy array of canvas points, shape (pts_details**2, 2).

    Examples:
        >>> all_points = create_canvas(size=4, pts_details=100)
    """
    all_points = np.array(create_mesh(size, pts_details)).T.reshape(-1, 2)
    return all_points
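
# A minimal sketch (assumed example, not in the original source) showing how the
# helpers compose: adjust_multiplots always returns a flat 1D axes array, and
# create_canvas stacks the mesh from create_mesh into (pts_details**2, 2) points.
def _demo_helpers():
    fig, ax = adjust_multiplots(n_plots=5, n_cols=3, figsize=None)
    print(ax.shape)          # (6,) -- a 2x3 grid flattened, indexable as ax[i]
    all_points = create_canvas(size=4, pts_details=100)
    print(all_points.shape)  # (10000, 2) -- every (x, y) pair on the grid
    plt.close(fig)
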
def plot_classification_boundary(func: Callable, data: Optional[np.ndarray] = None,
                                 size: int = 4, n_plot_cols: int = 1,
                                 figsize: tuple = (5, 5), canvas_details: int = 50,
                                 canvas_opacity: float = 0.5,
                                 canvas_palette: str = 'coolwarm'):
    """
    Plot a classification model's decision boundary.

    Args:
        func (Callable): prediction function of the ML model.
        data (Optional[np.ndarray], optional): source data, restricted to 2 features
            and 1 target, 3 columns in total. Defaults to None.
        size (int, optional): size of canvas. Defaults to 4.
        n_plot_cols (int, optional): number of columns for the grid of plots. Defaults to 1.
        figsize (tuple, optional): matplotlib figure size. Defaults to (5, 5).
        canvas_details (int, optional): how detailed the boundary should be. Defaults to 50.
        canvas_opacity (float, optional): canvas transparency parameter. Defaults to 0.5.
        canvas_palette (str, optional): palette of canvas. Defaults to 'coolwarm'.

    Raises:
        ValueError: if the input data's shape is not (k, 3), k = number of rows.

    Examples:
        >>> from sklearn.linear_model import LogisticRegression
        >>> from sklearn.datasets import make_classification
        >>> import numpy as np
        >>> import matplotlib.pyplot as plt
        >>> X, y = make_classification(n_samples=500, n_features=2, random_state=25,
        ...                            n_informative=1, n_classes=2, n_clusters_per_class=1,
        ...                            n_repeated=0, n_redundant=0)
        >>> model = LogisticRegression().fit(X, y)
        >>> plot_classification_boundary(func=model.predict,
        ...                              data=np.hstack((X, y.reshape(-1, 1))),
        ...                              canvas_details=100)
        >>> plt.show()
    """
    if data is not None:
        if not (len(data.shape) == 2 and data.shape[1] == 3):
            raise ValueError(
                "Only shape (k,3) data is allowed. For flat plotting purposes")

    xx, yy = create_mesh(size=size, pts_details=canvas_details)
    all_points = np.c_[xx.ravel(), yy.ravel()]

    # a 1D prediction becomes a single plot; multi-column output
    # (e.g. predict_proba) gets one plot per column
    probs = func(all_points)
    probs = probs if len(probs.shape) >= 2 else probs.reshape(-1, 1)
    n_plots = probs.shape[1]
    n_plot_rows = int(np.ceil(n_plots / n_plot_cols))

    fig, _ax = plt.subplots(nrows=n_plot_rows, ncols=n_plot_cols, figsize=figsize)
    _ax = _ax if isinstance(_ax, np.ndarray) else np.array([_ax])
    grid = _ax if len(_ax.shape) == 1 else _ax.reshape(n_plot_rows * n_plot_cols,)

    plotted = 0
    for ax_ele in grid:  # type: ignore
        zz = probs[:, plotted].reshape(xx.shape)
        ax_ele.contourf(xx, yy, zz, cmap=canvas_palette, alpha=canvas_opacity)
        if data is not None:
            sns.scatterplot(x=data[..., -3], y=data[..., -2], hue=data[..., -1],
                            palette='dark', ax=ax_ele, legend=False)
        plotted += 1
        if plotted == n_plots:
            break

    fig.tight_layout()
    return fig, _ax
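
# Hedged sketch of the multi-plot path (my example, not from the original docs):
# passing predict_proba instead of predict yields one boundary panel per class
# column, laid out using n_plot_cols.
def _demo_proba_boundaries():
    from sklearn.linear_model import LogisticRegression
    from sklearn.datasets import make_classification
    X, y = make_classification(n_samples=300, n_features=2, n_informative=2,
                               n_classes=3, n_clusters_per_class=1,
                               n_redundant=0, n_repeated=0, random_state=25)
    model = LogisticRegression().fit(X, y)
    # predict_proba returns shape (k, 3), so three panels are drawn
    plot_classification_boundary(func=model.predict_proba,
                                 data=np.hstack((X, y.reshape(-1, 1))),
                                 n_plot_cols=3, figsize=(12, 4),
                                 canvas_details=80)
    plt.show()
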
def grid_classification_boundary(models_list: list, data: Optional[np.ndarray] = None,
                                 size: int = 4, n_plot_cols: int = 3,
                                 figsize: tuple = (5, 5), canvas_details: int = 50,
                                 canvas_opacity: float = 0.4,
                                 canvas_palette='coolwarm') -> None:
    """
    Plot a grid of classification boundaries for multiple ML models.

    Only models with a 1D prediction output are allowed.

    Args:
        models_list (list): list of model dictionaries, each with a "name" and a
            "function" (prediction function) key.
        data (Optional[np.ndarray], optional): source data, restricted to 2 features
            and 1 target, 3 columns in total. Defaults to None.
        size (int, optional): size of canvas. Defaults to 4.
        n_plot_cols (int, optional): number of plot columns. Defaults to 3.
        figsize (tuple, optional): figure size. Defaults to (5, 5).
        canvas_details (int, optional): detailing in canvas. Defaults to 50.
        canvas_opacity (float, optional): canvas transparency parameter. Defaults to 0.4.
        canvas_palette (str, optional): palette from matplotlib. Defaults to 'coolwarm'.

    Raises:
        ValueError: only 3-column data (2 features, 1 target) is allowed.

    Examples:
        >>> from sklearn.linear_model import LogisticRegression
        >>> from sklearn.tree import DecisionTreeClassifier
        >>> from sklearn.datasets import make_classification
        >>> import numpy as np
        >>> import matplotlib.pyplot as plt
        >>> X, y = make_classification(n_samples=500, n_features=2, random_state=25,
        ...                            n_informative=1, n_classes=2, n_clusters_per_class=1,
        ...                            n_repeated=0, n_redundant=0)
        >>> lr_model = LogisticRegression().fit(X, y)
        >>> dt_model = DecisionTreeClassifier().fit(X, y)
        >>> models_list = [{
        ...     "name": "Logistic Regression Classifier",
        ...     "function": lr_model.predict
        ... }, {
        ...     "name": "Decision Tree Classifier",
        ...     "function": dt_model.predict
        ... }]
        >>> grid_classification_boundary(models_list=models_list,
        ...                              data=np.hstack((X, y.reshape(-1, 1))),
        ...                              figsize=(7, 5), canvas_details=100)
        >>> plt.show()
    """
    if data is not None:
        if not (len(data.shape) == 2 and data.shape[1] == 3):
            raise ValueError(
                "Only shape (k,3) data is allowed. For flat plotting purposes")

    xx, yy = create_mesh(size=size, pts_details=canvas_details)
    all_points = np.c_[xx.ravel(), yy.ravel()]
    n_plots = len(models_list)
    _, _ax = adjust_multiplots(n_plots=n_plots, n_cols=n_plot_cols, figsize=figsize)

    for i_plot in range(n_plots):
        func = models_list[i_plot]["function"]
        probs = func(all_points)
        if len(probs.shape) != 1:
            # warn, then fall back to the first output column
            print(f"model number {i_plot}'s output is not 1D")
        probs = probs if len(probs.shape) >= 2 else probs.reshape(-1, 1)

        zz = probs[:, 0].reshape(xx.shape)
        _ax[i_plot].contourf(xx, yy, zz, cmap=canvas_palette, alpha=canvas_opacity)
        if data is not None:
            sns.scatterplot(x=data[..., -3], y=data[..., -2], hue=data[..., -1],
                            palette='dark', ax=_ax[i_plot], legend=False)
        _ax[i_plot].set_title(models_list[i_plot]["name"])

    plt.tight_layout()
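
# Since grid_classification_boundary expects 1D model outputs, class probabilities
# can be adapted with a small wrapper -- a sketch with an assumed lambda, not part
# of the documented API:
def _demo_grid_with_proba():
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.datasets import make_classification
    X, y = make_classification(n_samples=300, n_features=2, n_informative=1,
                               n_classes=2, n_clusters_per_class=1,
                               n_redundant=0, n_repeated=0, random_state=25)
    dt_model = DecisionTreeClassifier(max_depth=3).fit(X, y)
    models_list = [
        {"name": "DT hard labels", "function": dt_model.predict},
        # select the class-1 probability column so the output stays 1D
        {"name": "DT P(class=1)",
         "function": lambda pts: dt_model.predict_proba(pts)[:, 1]},
    ]
    grid_classification_boundary(models_list=models_list,
                                 data=np.hstack((X, y.reshape(-1, 1))),
                                 n_plot_cols=2, figsize=(10, 4), canvas_details=80)
    plt.show()
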
def multi_distplots(df: pd.DataFrame, n_cols: int = 4, bins: int = 20, kde: bool = True,
                    class_col: Optional[str] = None, legend: bool = True,
                    legend_loc: str = 'best', figsize: Optional[tuple] = None,
                    palette: str = 'dark', grid_flag: bool = True,
                    xticks_rotation: int = 60) -> None:
    """
    Multiple distribution plots from a pandas dataframe.

    Seaborn's histplot is used for each distribution, with additional functionality
    to arrange multiple distributions in one grid.

    Args:
        df (pd.DataFrame): input dataframe.
        n_cols (int, optional): number of columns in the grid. Defaults to 4.
        bins (int, optional): number of bins in the distribution. Defaults to 20.
        kde (bool, optional): whether to draw the kde estimation line. Defaults to True.
        class_col (Optional[str], optional): class column name for distribution
            separation and legend. Defaults to None.
        legend (bool, optional): whether to draw a legend. Defaults to True.
        legend_loc (str, optional): legend location; takes inputs similar to
            matplotlib.pyplot. Defaults to 'best'.
        figsize (Optional[tuple], optional): figure size, similar to matplotlib.pyplot.
            Defaults to None.
        palette (str, optional): color palette, property from seaborn. Defaults to 'dark'.
        grid_flag (bool, optional): whether to draw a grid. Defaults to True.
        xticks_rotation (int, optional): xticks rotation angle. Defaults to 60.

    Examples:
        >>> from sklearn.datasets import fetch_california_housing
        >>> import pandas as pd
        >>> import matplotlib.pyplot as plt
        >>> dataset = fetch_california_housing()
        >>> df = pd.DataFrame(dataset.data, columns=dataset.feature_names)
        >>> df['target'] = dataset.target
        >>> multi_distplots(df, n_cols=2)
        >>> plt.show()
    """
    columns = df.columns
    n_labels = len(columns)
    _, _ax = adjust_multiplots(n_plots=n_labels, n_cols=n_cols, figsize=figsize)

    for idx, name in enumerate(columns):
        sns.histplot(data=df, x=name, hue=class_col, bins=bins, label=name,
                     ax=_ax[idx], legend=legend, palette=palette, kde=kde)
        # rotate tick labels for categorical (object dtype) columns
        if str(df[name].dtype) == 'object':
            _ax[idx].tick_params(labelrotation=xticks_rotation)
        _ax[idx].grid(grid_flag)

    if legend:
        plt.legend(loc=legend_loc)
    plt.tight_layout()
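
# A hedged usage sketch (example data and names are mine, not from the original
# docs): with class_col set, each feature's histogram is split per class via
# seaborn's hue mechanism.
def _demo_multi_distplots_by_class():
    from sklearn.datasets import load_iris
    dataset = load_iris()
    df = pd.DataFrame(dataset.data, columns=dataset.feature_names)
    # label classes with their names so the hue legend is readable
    df['species'] = pd.Categorical.from_codes(dataset.target, dataset.target_names)
    multi_distplots(df, n_cols=3, class_col='species', palette='dark')
    plt.show()
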