Source code for fsds_100719.ds.regression_project

"""A Collection of functions from ft study group for section 25."""
if __name__=='__main__':
    import matplotlib.pyplot as plt
    import numpy as np
    import statsmodels.api as sms
    import statsmodels.formula.api as smf

def make_ols_f(df,target='price',cat_cols = None,
               col_list=None, show_summary=True,exclude_cols=None):
    """
    Fit an ordinary least squares model with the statsmodels formula API.

    The formula is assembled as ``target ~ term1+term2+...`` where each term
    is a column name, wrapped as ``C(name)`` when the column is listed in
    ``cat_cols``.

    Args:
        df (Frame): data
        target (str, optional): Column to predict. Defaults to 'price'.
        cat_cols (list, optional): Columns to treat as categorical (and one-hot).
            Defaults to None (no categorical columns).
        col_list (list, optional): Columns to use as predictors. Defaults to
            all columns of df besides target. The passed list is not modified.
        show_summary (bool, optional): Display the model.summary() before
            returning model. Defaults to True.
        exclude_cols (list, optional): Column names to leave out of the
            predictors. Defaults to None (nothing excluded).
            - Note: if a column name doesn't appear in col_list, there will
              be no error nor warning message.

    Returns:
        model: The fit statsmodels OLS model
    """
    import statsmodels.formula.api as smf
    from IPython.display import display

    categorical = set(cat_cols) if cat_cols is not None else set()
    excluded = set(exclude_cols) if exclude_cols is not None else set()

    if col_list is None:
        col_list = df.drop(target,axis=1).columns

    # Build the terms column by column. Wrapping is decided by exact name
    # match, so a categorical column called 'bed' can never rewrite part of a
    # different column such as 'bedrooms'; building a new list also leaves the
    # caller's col_list untouched.
    terms = [
        f"C({col})" if col in categorical else col
        for col in col_list
        if col not in excluded
    ]

    formula = target+'~'+'+'.join(terms) #target~predictors

    model = smf.ols(formula=formula, data=df).fit()

    if show_summary:
        display(model.summary())

    return model

## diagnostic function

def diagnose_model(model):
    """
    Displays the QQplot and residuals of the model.

    The left subplot is a QQ plot of the residuals against a fitted normal
    distribution with a 45-degree reference line. The right subplot scatters
    the residuals, in their stored order, against evenly spaced x positions
    between 0 and 1.

    Args:
        model (statsmodels ols): A fit statsmodels ols model.

    Returns:
        fig (Figure): Figure object for output figure
        ax: The two subplot axes (ax[0] is the QQ plot, ax[1] the residuals).
    """
    # numpy is imported here because the module-level imports only run when
    # the file is executed as a script, which left `np` undefined on import.
    import matplotlib.pyplot as plt
    import numpy as np
    import scipy.stats as stats
    import statsmodels.api as sms

    resids = model.resid

    fig,ax = plt.subplots(ncols=2,figsize=(10,5))
    sms.qqplot(resids, stats.distributions.norm,
              fit=True, line='45',ax=ax[0])

    # The x positions only spread the points out; they carry no model meaning.
    xs = np.linspace(0,1,len(resids))
    ax[1].scatter(x=xs,y=resids)

    return fig,ax

def find_outliers_Z(df,col):
    """Use scipy to calculate absolute Z-scores
    and return boolean series where True indicates it is an outlier.

    A row is an outlier when its absolute Z-score is greater than 3.
    Missing values are ignored when computing the mean and standard
    deviation and are never flagged themselves.

    Args:
        df (Frame): DataFrame containing column to analyze
        col (str): Name of column to test.

    Returns:
        idx_outliers (Series): series of True/False for each row in col,
            sharing the index of df so it can be used directly with df.loc.

    Ex:
    >> idx_outs = find_outliers_Z(df,'bedrooms')
    >> df_clean = df.loc[idx_outs==False]"""
    import numpy as np
    import pandas as pd
    from scipy import stats

    values = df[col]

    # nan_policy='omit' stops a single NaN from turning every z-score into
    # NaN, which would silently report "no outliers" for the whole column.
    z = np.abs(stats.zscore(np.asarray(values, dtype=float),
                            nan_policy='omit'))

    # NaN > 3 evaluates to False, so missing rows are reported as not outliers.
    idx_outliers = pd.Series(np.asarray(z > 3), index=values.index,
                             name=values.name)
    return idx_outliers


def find_outliers_IQR(df,col):
    """
    Use Tukey's Method of outlier removal AKA InterQuartile-Range Rule
    and return boolean series where True indicates it is an outlier.
    - Calculates the range between the 75% and 25% quartiles
    - Outliers fall outside upper and lower limits, using a threshold of
      1.5*IQR beyond the 75% and 25% quartiles.
    - Values lying exactly on a limit are NOT outliers, so a column whose
      IQR is 0 (e.g. a constant column) does not have every row flagged.

    IQR Range Calculation:
        IQR = Q3 - Q1
        lower_limit = Q1 - 1.5*IQR
        upper_limit = Q3 + 1.5*IQR

    Args:
        df (Frame): DataFrame containing column to analyze
        col (str): Name of column to test.

    Returns:
        Series: True/False for each row in col (same index as df); True marks
            an outlier. Missing values compare False against both limits and
            are therefore reported as True.
    """
    values = df[col]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    IQR = q3 - q1
    lower_limit = q1 - 1.5*IQR
    upper_limit = q3 + 1.5*IQR

    # Inclusive comparisons: only values strictly beyond a fence are outliers.
    idx_goodvals = (values<=upper_limit) & (values>=lower_limit)

    return ~idx_goodvals


def vif_ols(df,exclude_col = None, cat_cols = None):
    """
    Performs variance inflation factor analysis on all columns in dataframe
    to identify Multicollinear data. The target column of the eventual model
    (indicated by exclude_col parameter) is dropped before the analysis.

    Each remaining column is regressed, with make_ols_f, on all of the other
    remaining columns; its VIF is 1 / (1 - R2) of that regression.

    Args:
        df (Frame): data
        exclude_col (str): Column to exclude from OLS model. (for VIF calculations).
            Defaults to None (keep every column).
        cat_cols (list, optional): List of columns to treat as categories for make_ols_f.
            Defaults to None (no categorical columns).

    Returns:
        res (Frame): DataFrame with results of VIF modeling (VIF and R2 score
            for each feature), sorted by VIF descending, plus a boolean 'use'
            column that is True where VIF < 5.
    """

    # let's check each column, build a model and get the r2
    import fsds_100719 as fs

    # Normalise here so make_ols_f always receives a real list to iterate.
    cat_cols = [] if cat_cols is None else list(cat_cols)

    # First row is the header consumed by list2df below.
    vif_scores = [['Column','VIF','R2']]

    if exclude_col is not None:
        df = df.drop(exclude_col,axis=1)

    for column in df.columns:
        # Plain list (not a pandas Index) to match make_ols_f's documented input.
        columns_to_use = list(df.drop(columns=[column]).columns)
        linreg = make_ols_f(df, target=column, cat_cols=cat_cols,
                            col_list=columns_to_use,show_summary=False)
        R2 = linreg.rsquared
        VIF = 1 / (1 - R2)
        vif_scores.append([column, VIF, R2])

    # NOTE(review): list2df is a project helper not visible here; presumably
    # it builds a DataFrame from the header+rows list indexed by 'Column'.
    res = fs.ds.list2df(vif_scores,index_col='Column')
    res.sort_values('VIF',ascending=False,inplace=True)
    res['use']=res['VIF'] <5
    return res