Source code for lrasm.normality_tst

import numpy as np
import pandas as pd
from sklearn import linear_model
from scipy import stats

def normality_test(X: pd.DataFrame, y: pd.Series, p_value_thresh: float = 0.05) -> tuple:
    """Check the normality-of-residuals assumption of a linear regression.

    A ``sklearn.linear_model.LinearRegression`` model is fit on ``X`` and
    ``y``, the residuals ``y - predictions`` are computed on the same data,
    and a Shapiro-Wilk test is applied to those residuals. A message stating
    whether the assumption passed or failed is printed to standard output.

    Parameters
    ----------
    X : pd.DataFrame
        Dataframe containing the explanatory variables. Every column must be
        numeric.
    y : pd.Series
        Series containing the response variable. Must be numeric and have
        one entry per row of ``X``.
    p_value_thresh : float
        Significance threshold the Shapiro-Wilk p-value is compared against,
        default set to 0.05. The test passes when the p-value is greater
        than or equal to this threshold.

    Returns
    -------
    tuple
        ``(p_value, res)`` where ``p_value`` is the p-value of the
        Shapiro-Wilk test on the residuals and ``res`` is the string
        ``"Pass"`` or ``"Fail"``.

    Raises
    ------
    TypeError
        If ``X`` is not a dataframe, ``y`` is not a series, either contains
        non-numeric data, or ``p_value_thresh`` cannot be compared with a
        number.
    ValueError
        If ``X`` and ``y`` do not have the same number of data points.

    Examples
    --------
    >>> normality_test(X_train, y_train, p_value_thresh=0.05)  # doctest: +SKIP
    """
    if not isinstance(X, pd.DataFrame):
        raise TypeError("Error: X must be a dataframe")

    if not isinstance(y, pd.Series):
        raise TypeError("Error: y must be a series")

    # Every column of X must survive a numeric-only selection.
    if X.shape[1] != X.select_dtypes(include=np.number).shape[1]:
        raise TypeError("Error: X must only contain numeric data.")

    if not pd.api.types.is_numeric_dtype(y):
        raise TypeError("Error: y must only contain numeric data.")

    if X.shape[0] != len(y):
        raise ValueError("Error: x and y must have the same number of data points")

    # Fail fast on a threshold that cannot be ordered against a number.
    # Probing the comparison (rather than checking for a specific type) keeps
    # every threshold that worked before working, while surfacing the error
    # before the model is fit instead of after.
    try:
        _ = 0.0 >= p_value_thresh
    except TypeError:
        raise TypeError("Error: p_value_thresh must be a number") from None

    lr = linear_model.LinearRegression()
    lr.fit(X, y)
    preds = lr.predict(X)
    residuals = y - preds

    shapiro_test = stats.shapiro(residuals)
    p_value = shapiro_test.pvalue

    if p_value >= p_value_thresh:
        res = "Pass"
        print(
            "After applying the Shapiro Wilks test for normality of the residuals "
            "the regression assumption of normality has passed and you can "
            "continue with your analysis"
        )
    else:
        res = "Fail"
        print(
            "After applying the Shapiro Wilks test for normality of the residuals "
            "the regression assumption of normality has failed and you should make "
            "some adjustments before continuing with your analysis"
        )

    return (p_value, res)