# Source code for fsds_100719.ds.regression_project

"""A Collection of functions from ft study group for section 25."""
if __name__=='__main__':
    import matplotlib.pyplot as plt
    import numpy as np
    import statsmodels.api as sms
    import statsmodels.formula.api as smf

def make_ols_f(df, target='price', cat_cols=None, col_list=None,
               show_summary=True, exclude_cols=None):
    """Fit an ordinary least squares model with the statsmodels formula API.

    The formula is assembled as ``target~term1+term2+...``, one term per
    predictor column. Columns listed in ``cat_cols`` are wrapped as
    ``C(column)`` so the formula treats them as categorical.

    Args:
        df (DataFrame): Data containing the target and predictor columns.
        target (str, optional): Column to predict. Defaults to 'price'.
        cat_cols (iterable of str, optional): Columns to treat as categorical.
            Names that are not among the predictors are ignored.
            Defaults to None (no categorical columns).
        col_list (iterable of str, optional): Predictor columns to use.
            Defaults to every column of ``df`` except ``target``. The object
            passed in is never modified.
        show_summary (bool, optional): Display ``model.summary()`` before
            returning the model. Defaults to True.
        exclude_cols (iterable of str, optional): Column names to leave out of
            the predictors. Names that are not among the predictors are
            silently ignored. Defaults to None (nothing excluded).

    Returns:
        model: The fit statsmodels OLS results object.
    """
    import statsmodels.formula.api as smf

    categorical = () if cat_cols is None else tuple(cat_cols)
    excluded = () if exclude_cols is None else tuple(exclude_cols)

    if col_list is None:
        col_list = df.drop(target, axis=1).columns

    # Build a fresh list instead of calling .remove() on the argument: this
    # leaves the caller's list untouched and also accepts a pandas Index.
    predictors = [col for col in col_list if col not in excluded]

    # Wrap categorical columns term by term. Doing the substitution on the
    # joined string would also rewrite any column whose name merely contains
    # a categorical column's name.
    terms = [f"C({col})" if col in categorical else col for col in predictors]

    # With no predictors left, fall back to an intercept-only model rather
    # than handing the formula parser an empty right-hand side.
    formula = target + '~' + ('+'.join(terms) or '1')  # target~predictors

    model = smf.ols(formula=formula, data=df).fit()

    if show_summary:
        # Imported only when needed so the function works without IPython
        # when no summary is requested.
        from IPython.display import display
        display(model.summary())

    return model
## diagnostic function
def diagnose_model(model):
    """Display a Q-Q plot and a scatter plot of a model's residuals.

    The left axes holds a Q-Q plot of ``model.resid`` against a normal
    distribution; the right axes holds the residuals scattered against
    evenly spaced x positions between 0 and 1 (one per residual, in the
    order they appear in ``model.resid``).

    Args:
        model (statsmodels ols): A fit statsmodels OLS model exposing ``resid``.

    Returns:
        fig (Figure): Figure object for the output figure.
        ax (array of Axes): The two subplot axes (Q-Q plot, residual scatter).
    """
    # numpy is imported here because the module-level imports sit behind an
    # ``if __name__ == '__main__'`` guard and are therefore missing whenever
    # this module is imported rather than run as a script.
    import numpy as np
    import matplotlib.pyplot as plt
    import statsmodels.api as sms
    import scipy.stats as stats

    resids = model.resid

    fig, ax = plt.subplots(ncols=2, figsize=(10, 5))

    # fit=True asks qqplot to estimate the distribution parameters from the
    # residuals; line='45' draws the 45-degree reference line.
    sms.qqplot(resids, stats.distributions.norm,
               fit=True, line='45', ax=ax[0])

    # The x values carry no meaning beyond spreading the points out evenly.
    xs = np.linspace(0, 1, len(resids))
    ax[1].scatter(x=xs, y=resids)

    return fig, ax
def find_outliers_Z(df, col):
    """Flag outliers in a column using absolute Z-scores.

    A value is an outlier when its absolute Z-score (computed with
    ``scipy.stats.zscore``) is greater than 3. Missing values are left out of
    the Z-score calculation and are never flagged, so a single NaN no longer
    hides every outlier in the column.

    Args:
        df (DataFrame): DataFrame containing the column to analyze.
        col (str): Name of the column to test.

    Returns:
        idx_outliers (numpy.ndarray): Boolean array with one entry per row of
            ``df``, in row order; True indicates an outlier.

    Ex:
        >> idx_outs = find_outliers_Z(df, 'bedrooms')
        >> df_clean = df.loc[idx_outs == False]
    """
    from scipy import stats
    import numpy as np

    values = np.asarray(df[col], dtype=float)
    idx_outliers = np.zeros(values.shape, dtype=bool)

    # Score only the observed values: with NaN present, zscore's mean and
    # standard deviation become NaN and every comparison below would be False.
    observed = ~np.isnan(values)
    if observed.any():
        # A constant column has zero standard deviation; the resulting 0/0
        # is NaN, which compares False, so nothing is flagged.
        with np.errstate(divide='ignore', invalid='ignore'):
            z = np.abs(stats.zscore(values[observed]))
            idx_outliers[observed] = z > 3

    return idx_outliers
def find_outliers_IQR(df, col):
    """Flag outliers in a column using Tukey's interquartile-range rule.

    IQR range calculation:
        IQR = Q3 - Q1                (75% and 25% quantiles of the column)
        lower_limit = Q1 - 1.5*IQR
        upper_limit = Q3 + 1.5*IQR

    Values strictly outside ``[lower_limit, upper_limit]`` are outliers.
    Values exactly on a limit are kept, so a column whose IQR is 0 (for
    example a constant column) does not have every row flagged. Missing
    values fail the range check and are therefore flagged as True.

    Args:
        df (DataFrame): DataFrame containing the column to analyze.
        col (str): Name of the column to test.

    Returns:
        idx_outliers (Series): Boolean Series sharing the index of
            ``df[col]``; True indicates an outlier.

    Ex:
        >> idx_outs = find_outliers_IQR(df, 'bedrooms')
        >> df_clean = df.loc[~idx_outs]
    """
    values = df[col]

    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    fence = 1.5 * (q3 - q1)

    # between() is inclusive on both ends by default, which keeps values
    # that sit exactly on a fence.
    idx_goodvals = values.between(q1 - fence, q3 + fence)
    return ~idx_goodvals
def vif_ols(df,exclude_col = None, cat_cols = []):
    """Run a variance inflation factor (VIF) analysis to spot multicollinearity.

    Each column of the dataframe is regressed, in turn, on all of the other
    columns with make_ols_f. The R2 of that fit gives the column's VIF as
    1 / (1 - R2). The column named by exclude_col (typically the modeling
    target) is dropped before any of the regressions are run.

    Args:
        df (Frame): data
        exclude_col (str, optional): Column to drop before the VIF
            calculations. Defaults to None (keep every column).
        cat_cols (list, optional): Columns to treat as categories; passed
            through to make_ols_f.

    Returns:
        res (Frame): Result of fs.ds.list2df for the collected rows, indexed
            by 'Column', with 'VIF' and 'R2' for each feature, sorted by VIF
            (largest first), plus a boolean 'use' column that is True where
            VIF is below 5.
    """
    import fsds_100719 as fs

    if exclude_col is not None:
        df = df.drop(exclude_col,axis=1)

    # First row is the header row handed to list2df.
    # NOTE(review): list2df is a project helper not visible here; presumably
    # it treats the first row as column names -- confirm.
    rows = [['Column','VIF','R2']]

    for feature in df.columns:
        # Regress this feature on every remaining column.
        predictors = df.drop(columns=[feature]).columns
        fitted = make_ols_f(df, target=feature, cat_cols=cat_cols,
                            col_list=predictors, show_summary=False)

        r_squared = fitted.rsquared
        rows.append([feature, 1 / (1 - r_squared), r_squared])

    res = fs.ds.list2df(rows,index_col='Column')
    res.sort_values('VIF',ascending=False,inplace=True)
    res['use']=res['VIF'] <5
    return res