Source code for fsds_100719.ds

"""A shared collection of tools for general use."""

from ..jmi import dict_dropdown,get_attributes,get_methods,get_methods_attributes_df    
from .regression_project import *
from .tsa import *

# import os, sys 
# os.listdir('../ds')
# sys.path.append(".")
from .flatiron_stats import *
# import flatiron_stats  as flatiron_stats
# import fsds_100719.ds.flatiron_stats as flatiron_stats

[docs]def add_dir_to_path(abs_path=None,rel_path=None,verbose=True): """Adds the provided path (or current directory if None provided) to sys.path. Args: path (str): folder to add to path (May need to be absolute). rel_path (str): relative folder path to be converted to absolute and added. verbose (bool): Controls display of success/failure messages. Default =True""" import pathlib, os, sys # If no path provided: if abs_path is None: ## If no relative path provided, use current dir if rel_path is not None: if verbose: print(f"[i] Converting relative path '{rel_path}' to absolute.") import os os.chdir(rel_path) add_path = os.path.abspath(os.curdir) # print os.path.abspath(os.) else: add_path = os.path.abspath(os.curdir) ## Set add_path = to provided path else: add_path=abs_path ## Check if folder already in path if add_path in sys.path: print(f'[i] Path already in sys.path:\n\t- "{add_path}"') return ## otherwise append path else: sys.path.append(add_path) ## Check if if add_path in sys.path: if verbose: print(f'[Success] Successfully added to sys.path:\n\t -"{add_path}"') else: if verbose: print(f'[Error] Path was not added to path.') return
[docs]def ihelp(function_or_mod, show_help=True, show_code=True,return_code=False,markdown=True,file_location=False): """Call on any module or functon to display the object's help command printout AND/OR soruce code displayed as Markdown using Python-syntax""" import inspect try: from IPython.display import display, Markdown except: print('[!] IPython was not found.') page_header = '---'*28 # footer = '---'*28+'\n' if show_help: print(page_header) banner = ''.join(["---"*2,' HELP ',"---"*24,'\n']) print(banner) help(function_or_mod) # print(footer) import sys if "google.colab" in sys.modules: markdown=False if show_code: print(page_header) banner = ''.join(["---"*2,' SOURCE -',"---"*23]) print(banner) try: import inspect source_DF = inspect.getsource(function_or_mod) if markdown == True: output = "```python" +'\n'+source_DF+'\n'+"```" display(Markdown(output)) else: print(source_DF) except TypeError: pass # display(Markdown) if file_location: file_loc = inspect.getfile(function_or_mod) banner = ''.join(["---"*2,' FILE LOCATION ',"---"*21]) print(page_header) print(banner) print(file_loc) # print(footer) if return_code: return source_DF
[docs]def list2df(list, index_col=None, caption=None, return_df=True,df_kwds={}): #, sort_values='index'): """ Quick turn an appened list with a header (row[0]) into a pretty dataframe. Args list (list of lists): index_col (string): name of column to set as index; None (Default) has integer index. set_caption (string): show_and_return (bool): EXAMPLE USE: >> list_results = [["Test","N","p-val"]] # ... run test and append list of result values ... >> list_results.append([test_Name,length(data),p]) ## Displays styled dataframe if caption: >> df = list2df(list_results, index_col="Test", set_caption="Stat Test for Significance") """ from IPython.display import display import pandas as pd df_list = pd.DataFrame(list[1:],columns=list[0],**df_kwds) if index_col is not None: df_list.reset_index(inplace=True) df_list.set_index(index_col, inplace=True) if caption is not None: dfs = df_list.style.set_caption(caption) display(dfs) return df_list
[docs]def arr2series(array,series_index=None, series_name='array'): """ Converts an array into a named series. Args: array (numpy array): Array to transform. series_index (list, optional): List of values to be used as index. Defaults to None, a numerical index. series_name (str, optional): Name for series. Defaults to 'array'. Returns: converted_array: Pandas Series with the name and index specified. """ import pandas as pd if len(series_index)==0: series_index=list(range(len(array))) if len(series_index)>len(array): new_index= series_index[-len(array):] series_index=new_index converted_array = pd.Series(array.ravel(), index=series_index, name=series_name) return converted_array
[docs]def ihelp_menu(function_list,box_style='warning', to_embed=False):#, to_file=False):#, json_file='ihelp_output.txt' ): """ Creates a widget menu of the source code and and help documentation of the functions in function_list. Args: function_list (list): list of function object or string names of loaded function. to_embed (bool, optional): Returns interface (layout,output) if True. Defaults to False. to_file (bool, optional): Save . Defaults to False. json_file (str, optional): [description]. Defaults to 'ihelp_output.txt'. Returns: full_layout (ipywidgets GridBox): Layout of interface. output () """ # Accepts a list of string names for loaded modules/functions to save the `help` output and # inspect.getsource() outputs to dictionary for later reference and display ## One way using sys to write txt file import pandas as pd import sys import inspect from io import StringIO from IPython.display import display,Markdown notebook_output = sys.stdout result = StringIO() sys.stdout=result ## Turn single input into a list if isinstance(function_list,list)==False: function_list = [function_list] ## Make a dictionary of{function_name : function_object} functions_dict = dict() for fun in function_list: ## if input is a string, save string as key, and eval(function) as value if isinstance(fun, str): functions_dict[fun] = eval(fun) ## if input is a function, get the name of function using inspect and make key, function as value elif inspect.isfunction(fun): members= inspect.getmembers(fun) member_df = pd.DataFrame(members,columns=['param','values']).set_index('param') fun_name = member_df.loc['__name__'].values[0] functions_dict[fun_name] = fun ## Create an output dict to store results for functions output_dict = {} for fun_name, real_func in functions_dict.items(): output_dict[fun_name] = {} ## First save help help(real_func) output_dict[fun_name]['help'] = result.getvalue() ## Clear contents of io stream result.truncate(0) try: ## Next save source source_DF = inspect.getsource(real_func) # # if markdown == True: # output = "```python" +'\n'+source_DF+'\n'+"```" # display(Markdown(output)) # else: # output=source_DF print(source_DF) # output_dict[fun_name]['source'] = source_DF # print(inspect.getsource(real_func)) #eval(fun)))###f"{eval(fun)}")) except: # print("Source code for object was not found") print("Source code for object was not found") # finally: output_dict[fun_name]['source'] = result.getvalue() ## clear contents of io stream result.truncate(0) ## Get file location try: file_loc = inspect.getfile(real_func) print(file_loc) except: print("File location not found") output_dict[fun_name]['file_location'] =result.getvalue() ## clear contents of io stream result.truncate(0) ## Reset display back to notebook sys.stdout = notebook_output # if to_file==True: # with open(json_file,'w') as f: # import json # json.dump(output_dict,f) ## CREATE INTERACTIVE MENU from ipywidgets import interact, interactive, interactive_output import ipywidgets as widgets from IPython.display import display # from functions_combined_BEST import ihelp # import functions_combined_BEST as ji ## Check boxes check_help = widgets.Checkbox(description="Show 'help(func)'",value=True) check_source = widgets.Checkbox(description="Show source code",value=True) check_fileloc=widgets.Checkbox(description="Show source filepath",value=False) check_boxes = widgets.HBox(children=[check_help,check_source,check_fileloc]) ## dropdown menu (dropdown, label, button) dropdown = widgets.Dropdown(options=list(output_dict.keys())) label = widgets.Label('Function Menu') button = widgets.ToggleButton(description='Show/hide',value=False) ## Putting it all together title = widgets.Label('iHelp Menu: View Help and/or Source Code') menu = widgets.HBox(children=[label,dropdown,button]) titled_menu = widgets.VBox(children=[title,menu]) full_layout = widgets.GridBox(children=[titled_menu,check_boxes],box_style=box_style) ## Define output manager # show_output = widgets.Output() def dropdown_event(change): new_key = change.new output_display = output_dict[new_key] dropdown.observe(dropdown_event,names='values') def show_ihelp(display_help=button.value,function=dropdown.value, show_help=check_help.value,show_code=check_source.value, show_file=check_fileloc.value,ouput_dict=output_dict): from IPython.display import Markdown # import functions_combined_BEST as ji from IPython.display import display page_header = '---'*28 # import json # with open(json_file,'r') as f: # output_dict = json.load(f) func_dict = output_dict[function] source_code=None if display_help: if show_help: # display(print(func_dict['help'])) print(page_header) banner = ''.join(["---"*2,' HELP ',"---"*24,'\n']) print(banner) print(func_dict['help']) if show_code: print(page_header) banner = ''.join(["---"*2,' SOURCE -',"---"*23]) print(banner) source_code = func_dict['source']#.encode('utf-8') if source_code.startswith('`'): source_code = source_code.replace('`',"").encode('utf-8') if 'google.colab' in sys.modules: print(source_code) else: md_source = "```python\n"+source_code md_source += "```" display(Markdown(md_source)) if show_file: print(page_header) banner = ''.join(["---"*2,' FILE LOCATION ',"---"*21]) print(banner) file_loc = func_dict['file_location'] print(file_loc) if show_help==False & show_code==False & show_file==False: display('Check at least one "Show" checkbox for output.') else: display('Press "Show/hide" for display') ## Fully integrated output output = widgets.interactive_output(show_ihelp,{'display_help':button, 'function':dropdown, 'show_help':check_help, 'show_code':check_source, 'show_file':check_fileloc}) if to_embed: return full_layout, output else: display(full_layout, output)
[docs]def inspect_variables(local_vars = None,sort_col='size',exclude_funcs_mods=True, top_n=10,return_df=False,always_display=True, show_how_to_delete=False,print_names=False): """ Displays a dataframe of all variables and their size in memory, with the largest variables at the top. Args: local_vars (locals(): Must call locals() as first argument. sort_col (str, optional): column to sort by. Defaults to 'size'. top_n (int, optional): how many vars to show. Defaults to 10. return_df (bool, optional): If True, return df instead of just showing df.Defaults to False. always_display (bool, optional): Display df even if returned. Defaults to True. show_how_to_delete (bool, optional): Prints out code to copy-paste into cell to del vars. Defaults to False. print_names (bool, optional): [description]. Defaults to False. Raises: Exception: if locals() not passed as first arg Example Usage: # Must pass in local variables >> inspect_variables(locals()) # To see command to delete list of vars" >> inspect_variables(locals(),show_how_to_delete=True) """ import sys import inspect import pandas as pd from IPython.display import display if local_vars is None: raise Exception('Must pass "locals()" in function call. i.e. inspect_variables(locals())') glob_vars= [k for k in globals().keys()] loc_vars = [k for k in local_vars.keys()] var_list = glob_vars+loc_vars var_df = pd.DataFrame(columns=['variable','size','type']) exclude = ['In','Out'] var_list = [x for x in var_list if (x.startswith('_') == False) and (x not in exclude)] i=0 for var in var_list:#globals().items():#locals().items(): if var in loc_vars: real_var = local_vars[var] elif var in glob_vars: real_var = globals()[var] else: print(f"{var} not found.") var_size = sys.getsizeof(real_var) var_type = [] if inspect.isfunction(real_var): var_type = 'function' if exclude_funcs_mods: continue elif inspect.ismodule(real_var): var_type = 'module' if exclude_funcs_mods: continue elif inspect.isbuiltin(real_var): var_type = 'builtin' elif inspect.isclass(real_var): var_type = 'class' else: var_type = real_var.__class__.__name__ var_row = pd.Series({'variable':var,'size':var_size,'type':var_type}) var_df.loc[i] = var_row#pd.concat([var_df,var_row],axis=0)#.join(var_row,) i+=1 # if exclude_funcs_mods: # var_df = var_df.loc[var_df['type'] not in ['function', 'module'] ] var_df.sort_values(sort_col,ascending=False,inplace=True) var_df.reset_index(inplace=True,drop=True) var_df.set_index('variable',inplace=True) var_df = var_df[['type','size']] if top_n is not None: var_df = var_df.iloc[:top_n] if always_display: display(var_df.style.set_caption('Current Variables by Size in Memory')) if show_how_to_delete: print('---'*15) print('## CODE TO DELETE MANY VARS AT ONCE:') show_del_me_code(called_by_inspect_vars=True) if print_names ==False: print('#[i] set `print_names=True` for var names to copy/paste.') print('---'*15) else: print('---'*15) print('Variable Names:\n') print_me = [f"{str(x)}" for x in var_df.index] print(print_me) if show_del_me_code == False: print("[i] set `show_del_me_code=True prints copy/paste var deletion code.") if return_df: return var_df
[docs]def show_del_me_code(called_by_inspect_vars=False): """Prints code to copy and paste into a cell to delete vars using a list of their names. Companion function inspect_variables(locals(),print_names=True) will provide var names tocopy/paste """ from pprint import pprint if called_by_inspect_vars==False: print("#[i]Call: `inspect_variables(locals(), print_names=True)` for list of var names") del_me = """ del_me= []#list of variable names for me in del_me: try: exec(f'del {me}') print(f'del {me} succeeded') except: print(f'del {me} failed') continue """ print(del_me)
[docs]def reload(mod): """Reloads the module from file without restarting kernel. Args: mod (loaded mod or list of mod objects): name or handle of package (i.e.,[ pd, fs,np]) Returns: reload each model. Example: # You pass in whatever name you imported as. import my_functions_from_file as mf # after editing the source file: # mf.reload(mf)""" from importlib import reload print(f'Reloading...\n') ## if isinstance(mod,list): return [reload(m) for m in mod] else: return reload(mod)
# from ..jmi import print_array_info,print_docstring_template #dict_dropdown,
[docs]def save_ihelp_to_file(function,save_help=False,save_code=True, as_md=False,as_txt=True, folder='readme_resources/ihelp_outputs/', filename=None,file_mode='w'): """Saves the string representation of the ihelp source code as markdown. Filename should NOT have an extension. .txt or .md will be added based on as_md/as_txt. If filename is None, function name is used.""" if as_md & as_txt: raise Exception('Only one of as_md / as_txt may be true.') import sys from io import StringIO ## save original output to restore orig_output = sys.stdout ## instantiate io stream to capture output io_out = StringIO() ## Redirect output to output stream sys.stdout = io_out if save_code: print('### SOURCE:') help_md = get_source_code_markdown(function) ## print output to io_stream print(help_md) if save_help: print('### HELP:') help(function) ## Get printed text from io stream text_to_save = io_out.getvalue() ## MAKE FULL FILENAME if filename is None: ## Find the name of the function import re func_names_exp = re.compile(r'def (\w*)\(') func_name = func_names_exp.findall(text_to_save)[0] print(f'Found code for {func_name}') save_filename = folder+func_name#+'.txt' else: save_filename = folder+filename if as_md: ext = '.md' elif as_txt: ext='.txt' full_filename = save_filename + ext with open(full_filename,file_mode) as f: f.write(text_to_save) print(f'Output saved as {full_filename}') sys.stdout = orig_output
[docs]def get_source_code_markdown(function): """Retrieves the source code as a string and appends the markdown python syntax notation""" import inspect from IPython.display import display, Markdown source_DF = inspect.getsource(function) output = "```python" +'\n'+source_DF+'\n'+"```" return output
# def quick_ref(topic='student_resource_folder'): # """Displays quick reference url links and info. # Args: # topic (str): selects which reference info to show. # - `student_resource_folder` : data science gdrive url # - `fsds` :documentaion url""" # if 'student_resource_folder' in topic: # print('Data Science Student Resources:') # print('https://flatiron.online/StudentResourcesGdrive') # if 'fsds' in topic: # print('fsds_100719 Package Documentation:')
[docs]def show_off_vs_code(): pass
[docs]def check_column(panda_obj, columns=None,nlargest='all'): """ Prints column name, dataype, # and % of null values, and unique values for the nlargest # of rows (by valuecount_. it will only print results for those columns ************ Params: panda_object: pandas DataFrame or Series columns: list containing names of columns (strings) Returns: None prints values only """ import numpy as np import pandas as pd # Check for DF vs Series if type(panda_obj)==pd.core.series.Series: series=panda_obj print(f'\n----------------------------\n') print(f"Column: df['{series.name}']':") print(f"dtype: {series.dtype}") print(f"isna: {series.isna().sum()} out of {len(series)} - {round(series.isna().sum()/len(series)*100,3)}%") print(f'\nUnique non-na values:') if nlargest =='all': print(series.value_counts()) else: print(series.value_counts().nlargest(nlargest)) elif type(panda_obj)==pd.core.frame.DataFrame: df = panda_obj for col_name in df.columns: col = df[col_name] print("\n-----------------------------------------------") print(f"Column: df['{col_name}']':") print(f"dtype: {col.dtypes}") print(f"isna: {col.isna().sum()} out of {len(col)} - {round(col.isna().sum()/len(col)*100,3)}%") print(f'\nUnique non-na values:\nnlargest={nlargest}\n-----------------') if nlargest =='all': print(col.value_counts()) else: print(col.value_counts().nlargest(nlargest))
[docs]def check_df_for_columns(df, columns=None): """ Checks df for presence of columns. args: ********** df: pd.DataFrame to find columns in columns: str or list of str. column names """ if not columns: print('check_df_for_columns expected to be passed a list of column names.') else: for column in columns: if not column in df.columns: continue else: print(f'{column} is a valid column name') pass
[docs]def check_unique(df, columns=None): """ Prints unique values for all columns in dataframe. If passed list of columns, it will only print results for those columns 8************ > Params: df: pandas DataFrame, or pd.Series columns: list containing names of columns (strings) Returns: None prints values only """ from IPython.display import display import pandas as pd # check for columns # if columns is None: # Check if series, even though this is unnecesary because you could simply # Call pd.series.sort_values() if isinstance(df, pd.Series): # display all the value counts nunique = df.nunique() print(f'\n---------------------------\n') print(f"{df.name} Type: {df.dtype}\nNumber unique values: {nunique}") return pd.DataFrame(df.value_counts()) else: if columns is None: columns = df.columns for col in columns: nunique = df[col].nunique() unique_df = pd.DataFrame(df[col].value_counts()) print(f'\n---------------------------') print(f"\n{col} Type: {df[col].dtype}\nNumber unique values: {nunique}.") display(unique_df) pass
[docs]def check_numeric(df, columns=None, unique_check=False, return_list=False, show_df=False): """ Iterates through columns and checks for possible numeric features labeled as objects. Params: ****************** df: pandas DataFrame unique_check: bool. (default=True) If true, distplays interactive interface for checking unique values in columns. return_list: bool, (default=False) If True, returns a list of column names with possible numeric types. **********> Returns: dataframe displayed (always), list of column names if return_list=True """ # from .bs_ds import list2df from IPython.display import display display_list = [['Column', 'Numeric values','Total Values', 'Percent']] outlist = [] # print(f'\n---------------------------------------------------\n') # print(f'# of Identified Numeric Values in "Object" columns:') # Check for user column list columns_to_check = [] if columns is None: columns_to_check = df.columns else: columns_to_check = columns # Iterate through columns for col in columns_to_check: # Check for object dtype, if df[col].dtype == 'object': # If object, check for numeric if df[col].str.isnumeric().any(): # If numeric, get counts vals = df[col].str.isnumeric().sum() percent = round((df[col].str.isnumeric().sum()/len(df[col]))*100, 2) display_list.append([col, vals,len(df[col]), percent]) outlist.append(col) list2show = list2df(display_list) list2show.set_index('Column',inplace=True) styled_list2show = list2show.style.set_caption('# of Detected Numeric Values in "Object" columns:') if show_df==True: display(styled_list2show) if unique_check: unique = input("display unique values? (Enter 'y' for all columns, a column name, or 'n' to quit):") while unique != 'n': if unique == 'y': check_unique(df, outlist) break elif unique in outlist: name = [unique] check_unique(df, name) unique = input('Enter column name or n to quit:') if return_list==True: return styled_list2show, outlist else: return styled_list2show
[docs]def check_null(df, columns=None,show_df=False): """ Iterates through columns and checks for null values and displays # and % of column. Params: ****************** df: pandas DataFrame columns: list of columns to check **********> Returns: displayed dataframe """ from IPython.display import display # from .bs_ds import list2df display_list = [['Column', 'Null values', 'Total Values','Percent']] outlist = [] # print(f'\n----------------------------\n') # print(f'# of Identified Null Values:') # Check for user column list columns_to_check = [] if columns==None: columns_to_check = df.columns else: columns_to_check = columns # Iterate through columns for col in columns_to_check: # Check for object dtype, # if df[col].dtype == 'object': # If object, check for numeric # If numeric, get counts vals = df[col].isna().sum() percent = round((vals/len(df[col]))*100, 3) display_list.append([col, vals, len(df[col]), percent]) outlist.append(col) list2show=list2df(display_list) list2show.set_index('Column',inplace=True) styled_list2show = list2show.style.set_caption('# of Identified Null Values:') if show_df==True: display(styled_list2show) return styled_list2show
[docs]def compare_duplicates(df1, df2, to_drop=True, verbose=True, return_names_list=False): """ Compare two dfs for duplicate columns, drop if to_drop=True, useful to us before concatenating when dtypes are different between matching column names and df.drop_duplicates is not an option. Params: -------------------- df1, df2 : pandas dataframe suspected of having matching columns to_drop : bool, (default=True) If True will give the option of dropping columns one at a time from either column. verbose: bool (default=True) If True prints column names and types, set to false and return_names list=True if only desire a list of column names and no interactive interface. return_names_list: bool (default=False), If True, will return a list of all duplicate column names. -------------------- Returns: List of column names if return_names_list=True, else nothing. """ catch = [] dropped1 = [] dropped2 = [] if verbose: print("Column | df1 | df2 ") print("*----------------------*") # Loop through columns, inspect for duplicates for col in df1.columns: if col in df2.columns: catch.append(col) if verbose: print(f"{col} {df1[col].dtype} {df2[col].dtype}") # Accept user input and drop columns one by one if to_drop: choice = input("\nDrop this column? Enter 1. df1, 2. df2 or n for neither") if choice == "1": df1.drop(columns=col, axis=1, inplace=True) dropped1.append(col) elif choice == "2": df2.drop(columns=col, axis=1, inplace=True) dropped2.append(col) else: continue # Display dropped columns and orignating df if to_drop: if len(dropped1) >= 1: print(f"\nDropped from df1:\n{dropped1}") if len(dropped2) >= 1: print(f"\nDropped from df1:\n{dropped2}") if return_names_list: return catch else: pass
# ## Dataframes styling # def check_column(panda_obj, columns=None,nlargest='all'): # """ # Prints column name, dataype, # and % of null values, and unique values for the nlargest # of rows (by valuecount_. # it will only print results for those columns # ************ # Params: # panda_object: pandas DataFrame or Series # columns: list containing names of columns (strings) # Returns: None # prints values only # """ # import pandas as pd # # Check for DF vs Series # if type(panda_obj)==pd.core.series.Series: # series=panda_obj # print(f'\n----------------------------\n') # print(f"Column: df['{series.name}']':") # print(f"dtype: {series.dtype}") # print(f"isna: {series.isna().sum()} out of {len(series)} - {round(series.isna().sum()/len(series)*100,3)}%") # print(f'\nUnique non-na values:') # if nlargest =='all': # print(series.value_counts()) # else: # print(series.value_counts().nlargest(nlargest)) # elif type(panda_obj)==pd.core.frame.DataFrame: # df = panda_obj # for col_name in df.columns: # col = df[col_name] # print("\n-----------------------------------------------") # print(f"Column: df['{col_name}']':") # print(f"dtype: {col.dtypes}") # print(f"isna: {col.isna().sum()} out of {len(col)} - {round(col.isna().sum()/len(col)*100,3)}%") # print(f'\nUnique non-na values:\nnlargest={nlargest}\n-----------------') # if nlargest =='all': # print(col.value_counts()) # else: # print(col.value_counts().nlargest(nlargest)) ## DataFrame Creation, Inspection, and Exporting
[docs]def inspect_df(df, n_rows=3, verbose=True): """ EDA: Show all pandas inspection tables. Displays df.head(), df.info(), df.describe(). By default also runs check_null and check_numeric to inspect columns for null values and to check string columns to detect numeric values. (If verbose==True) Parameters: df(dataframe): dataframe to inspect n_rows: number of header rows to show (Default=3). verbose: If verbose==True (default), check_null and check_numeric. Ex: inspect_df(df,n_rows=4) """ # from bs_ds.bamboo import check_column, check_null, check_numeric, check_unique # from bs_ds.prettypandas import display_side_by_side import pandas as pd from IPython.display import display with pd.option_context("display.max_columns", None ,'display.precision',3): display(df.info()) #, display(df.describe()) if verbose == True: df_num = check_numeric(df,unique_check=False, show_df=False) # sdf_num = df_num.style.set_caption('Detected Numeric Values') df_null = check_null(df, show_df=False) # sdf_null = df_null.style.set_caption('Detected Null values') display_side_by_side(df_null, df_num,df.describe()) else: display(df.describe()) display(df.head(n_rows))
[docs]def display_side_by_side(*args): """Display all input dataframes side by side. Also accept captioned styler df object (df_in = df.style.set_caption('caption') Modified from Source: https://stackoverflow.com/questions/38783027/jupyter-notebook-display-two-pandas-tables-side-by-side""" from IPython.display import display_html import pandas html_str='' for df in args: if type(df) == pandas.io.formats.style.Styler: html_str+= ' ' html_str+=df.render() else: html_str+=df.to_html() display_html(html_str.replace('table','table style="display:inline"'),raw=True)
[docs]def column_report(df,index_col=None, sort_column='iloc', ascending=True, interactive=False, return_df=False): """ Displays a DataFrame summary of each column's: - name, iloc, dtypes, null value count & %, # of 0's, min, max,med,mean, etc Args: df (DataFrame): df to report index_col (column to set as index, str): Defaults to None. sort_column (str, optional): [description]. Defaults to 'iloc'. ascending (bool, optional): [description]. Defaults to True. as_df (bool, optional): [description]. Defaults to False. interactive (bool, optional): [description]. Defaults to False. return_df (bool, optional): [description]. Defaults to False. Returns: column_report (df): Non-styled version of displayed df report """ from ipywidgets import interact import pandas as pd from IPython.display import display def count_col_zeros(df, columns=None): import pandas as pd import numpy as np # Make a list of keys for every column (for series index) zeros = pd.Series(index=df.columns) # use all cols by default if columns is None: columns=df.columns # get sum of zero values for each column for col in columns: zeros[col] = np.sum( df[col].values == 0) return zeros ## df_report = pd.DataFrame({'.iloc[:,i]': range(len(df.columns)), 'column name':df.columns, 'dtypes':df.dtypes.astype('str'), '.isna()': df.isna().sum().round(), '% na':df.isna().sum().divide(df.shape[0]).mul(100).round(2), '# zeros': count_col_zeros(df), '# unique':df.nunique(), 'min':df.min(), 'max':df.max(), 'med':df.describe().loc['50%'], 'mean':df.mean().round(2)})# ## Sort by index_col if index_col is not None: hide_index=False if 'iloc' in index_col: index_col = '.iloc[:,i]' df_report.set_index(index_col ,inplace=True) else: hide_index=True ## Sort column if sort_column is None: sort_column = '.iloc[:,i]' if 'iloc' in sort_column: sort_column = '.iloc[:,i]' df_report.sort_values(by =sort_column,ascending=ascending, axis=0, inplace=True) dfs = df_report.style.set_caption('Column Report') if hide_index: display(dfs.hide_index()) else: display(dfs) if interactive: @interact(column= df_report.columns,direction={'ascending':True,'descending':False}) def sort_df(column, direction): return df_report.sort_values(by=column,axis=0,ascending=direction) if return_df: return df_report
[docs]def column_report_qgrid(df,index_col=None, sort_column='iloc', ascending=True, format_dict=None, as_df = False, as_interactive_df=False, show_and_return=True, as_qgrid=True, qgrid_options=None, qgrid_column_options=None, qgrid_col_defs=None, qgrid_callback=None): """ Returns a datafarme summary of the columns, their dtype, a summary dataframe with the column name, column dtypes, and a `decision_map` dictionary of datatype. [!] Please note if qgrid does not display properly, enter this into your terminal and restart your temrinal. 'jupyter nbextension enable --py --sys-prefix qgrid'# required for qgrid 'jupyter nbextension enable --py --sys-prefix widgetsnbextension' # only required if you have not enabled the ipywidgets nbextension yet Default qgrid options: default_grid_options={ # SlickGrid options 'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': True, 'defaultColumnWidth': 50, 'rowHeight': 25, 'enableColumnReorder': True, 'enableTextSelectionOnCells': True, 'editable': True, 'autoEdit': False, 'explicitInitialization': True, # Qgrid options 'maxVisibleRows': 30, 'minVisibleRows': 8, 'sortable': True, 'filterable': True, 'highlightSelectedCell': True, 'highlightSelectedRow': True } """ from ipywidgets import interact import pandas as pd from IPython.display import display import qgrid small_col_width = 20 # default_col_options={'width':20} default_column_definitions={'column name':{'width':60}, '.iloc[:,i]':{'width':small_col_width}, 'dtypes':{'width':30}, '# zeros':{'width':small_col_width}, '# null':{'width':small_col_width}, '% null':{'width':small_col_width}}#, # name_for_notes_col:{'width':100}} default_grid_options={ # SlickGrid options 'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': True, 'defaultColumnWidth': 50, 'rowHeight': 25, 'enableColumnReorder': True, 'enableTextSelectionOnCells': True, 'editable': True, 'autoEdit': False, 'explicitInitialization': True, # Qgrid options 'maxVisibleRows': 30, 'minVisibleRows': 8, 'sortable': True, 'filterable': True, 'highlightSelectedCell': True, 'highlightSelectedRow': True } ## Set the params to defaults, to then be overriden column_definitions = default_column_definitions grid_options=default_grid_options # column_options = default_col_options if qgrid_options is not None: for k,v in qgrid_options.items(): grid_options[k]=v if qgrid_col_defs is not None: for k,v in qgrid_col_defs.items(): column_definitions[k]=v else: column_definitions = default_column_definitions # format_dict = {'sum':'${0:,.0f}', 'date': '{:%m-%Y}', 'pct_of_total': '{:.2%}'} # monthly_sales.style.format(format_dict).hide_index() def count_col_zeros(df, columns=None): import pandas as pd import numpy as np # Make a list of keys for every column (for series index) zeros = pd.Series(index=df.columns) # use all cols by default if columns is None: columns=df.columns # get sum of zero values for each column for col in columns: zeros[col] = np.sum( df[col].values == 0) return zeros ## df_report = pd.DataFrame({'.iloc[:,i]': range(len(df.columns)), 'column name':df.columns, 'dtypes':df.dtypes.astype('str'), '.isna()': df.isna().sum().round(), '% na':df.isna().sum().divide(df.shape[0]).mul(100).round(2), '# zeros': count_col_zeros(df), '# unique':df.nunique(), 'min':df.min(), 'max':df.max(), 'med':df.describe().loc['50%'], 'mean':df.mean().round(2)})# ## Sort by index_col if index_col is not None: hide_index=False if 'iloc' in index_col: index_col = '.iloc[:,i]' df_report.set_index(index_col ,inplace=True) else: hide_index=True ## Sort column if sort_column is None: sort_column = '.iloc[:,i]' if 'iloc' in sort_column: sort_column = '.iloc[:,i]' df_report.sort_values(by =sort_column,ascending=ascending, axis=0, inplace=True) ##If running from colab, override qgrid import sys if ('google.colab' in sys.modules )& (as_qgrid==True) : as_df=True as_qgrid=False if as_df: if show_and_return: dfs = df_report.style.set_caption('Column Report') if hide_index: display(dfs.hide_index()) else: display(dfs) return df_report elif as_qgrid: print('[i] qgrid returned. Use gqrid.get_changed_df() to get edited df back.') qdf = qgrid.show_grid(df_report,grid_options=grid_options, column_options=qgrid_column_options, column_definitions=column_definitions,row_edit_callback=qgrid_callback ) if show_and_return: display(qdf) return qdf elif as_interactive_df: @interact(column= df_report.columns,direction={'ascending':True,'descending':False}) def sort_df(column, direction): return df_report.sort_values(by=column,axis=0,ascending=direction) else: raise Exception('One of the output options must be true: `as_qgrid`,`as_df`,`as_interactive_df`')
#################### GENERAL HELPER FUNCTIONS #####################
[docs]def is_var(name): x=[] try: eval(name) except NameError: x = None if x is None: return False else: return True
[docs]def capture_text(txt): """Uses StringIO and sys.stdout to capture print statements. Args: txt (str): pass string or command to display a string to capture Returns: txt_out (str): captured print statement""" import sys from io import StringIO notebook_output = sys.stdout result = StringIO() sys.stdout=result print(txt) txt_out = result.getvalue() sys.stdout=notebook_output return txt_out
[docs]def find_outliers_zscore(col): """Use scipy to calcualte absoliute Z-scores and return boolean series where True indicates it is an outlier Args: col (Series): a series/column from your DataFrame Returns: idx_outliers (Series): series of True/False for each row in col Ex: >> idx_outs = find_outliers(df['bedrooms']) >> df_clean = df.loc[idx_outs==False]""" from scipy import stats import numpy as np import pandas as pd z = np.abs(stats.zscore(col)) idx_outliers = np.where(z>3,True,False) return pd.Series(idx_outliers,index=col.index)