# Source code for lrasm.homoscedasticity_tst

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model
from matplotlib.pyplot import figure
from scipy import stats

def homoscedasticity_test(X, y, threshold=0.05):
    """Check the homoscedasticity of a linear regression of ``y`` on ``X``.

    A ``LinearRegression`` model is fitted to ``X`` and ``y``. The residuals
    are drawn against the fitted values in a scatter plot, and the Spearman
    correlation between the absolute residuals and the fitted values is
    computed. A short interpretation of the result is printed.

    Parameters
    ----------
    X : pd.DataFrame
        Dataframe containing the explanatory variables. Every column must
        be numeric.

    y : pd.Series
        Series containing the response variable. Must be numeric.

    threshold : float, default 0.05
        Rejection threshold the p-value of the correlation is compared with.

    Returns
    -------
    corr_df : pd.DataFrame or None
        One-row dataframe with the columns ``correlation_coefficient``
        (absolute Spearman correlation) and ``p_value``, both rounded to
        three decimals.
    plot : matplotlib.figure.Figure or None
        Scatter plot of the residuals against the fitted values.

    If an input fails validation, an error message is printed and
    ``(None, None)`` is returned.

    Examples
    --------
    >>> corr_df, plot = homoscedasticity_test(X_train, y_train)
    >>> corr_df
       correlation_coefficient  p_value
    0                    0.038    0.427
    """

    # Validation failures are reported by printing and returning a pair of
    # None values (rather than raising) so existing callers keep working.
    if not isinstance(X, pd.DataFrame):
        print("Error: X must be a dataframe")
        return None, None

    if not isinstance(y, pd.Series):
        print("Error: y must be a series")
        return None, None

    numeric_columns = X.select_dtypes(include=np.number).shape[1]
    if X.shape[1] != numeric_columns:
        print("Error: X must only contain numeric data.")
        return None, None

    if not pd.api.types.is_numeric_dtype(y):
        print("Error: y must only contain numeric data.")
        return None, None

    lr = linear_model.LinearRegression()
    lr.fit(X, y)
    preds = lr.predict(X)
    comp_df = pd.DataFrame({"Real": y, "Predicted": preds, "residuals": y - preds})
    comp_df["abs_res"] = comp_df["residuals"].abs()

    # Draw on the axes of the figure that is returned instead of relying on
    # pyplot's implicit "current figure" state.
    plot = figure(figsize=(10, 6), dpi=80)
    axes = plot.add_subplot(1, 1, 1)
    axes.scatter(x=comp_df["Predicted"], y=comp_df["residuals"], alpha=0.5)
    axes.axhline(y=0.0, color='r', linestyle='--')
    axes.set_xlabel("Fitted Target Values")
    axes.set_ylabel("Residuals")
    axes.set_title("Plot of Residuals vs Fitted Values")

    # Run the rank correlation once and reuse both of its outputs.
    spearman = stats.spearmanr(comp_df["Predicted"], comp_df["abs_res"])
    raw_pval = spearman.pvalue
    corr = round(abs(spearman.correlation), 3)
    pval = round(abs(raw_pval), 3)
    corr_df = pd.DataFrame({"correlation_coefficient": [corr], "p_value": [pval]})

    print("The correlation coefficient between the absolute residuals and the fitted y values is: ", str(corr), " With a p value of: ", str(pval))

    # The decision uses the unrounded p-value: rounding first can push a
    # p-value that is just above the threshold onto (or below) it and flip
    # the verdict. The rounded value is only used for reporting.
    # The run of spaces before each "\n" is kept so the printed text stays
    # identical to what earlier versions emitted.
    if raw_pval > threshold:
        print("The p value of the correlation is above the rejection threshold, thus the correlation is likely not significant. "
              "        \nThe data is likely to be homoscedastic if the cluster of points has similar width throughout the X axis on the residuals plot.")
    else:
        print("The p value of the correlation is below the rejection threshold, thus the correlation is likely significant. "
              "        \nThe data is unlikely to be homoscedastic.")

    return corr_df, plot