Source code for fsds_100719.ft.hakkeray

"""My Template Module 
Name: Ru Keïn
Email: rukeine@gmail.com
GitHub Profile: https://github.com/hakkeray
"""
# import fsds_100719 as fs 
from fsds_100719.ds import ihelp,ihelp_menu, reload
from fsds_100719.jmi import print_docstring_template
# print(f"[i] You're using V {fs.__version__} of fsds.")
# HOT_STATS() function: display statistical summaries of a feature column

def _non_numeric_summary(feature):
    """Return the min/median/max of a non-numeric column as a Series.

    Each statistic is computed on its own so that one unsupported reduction
    (e.g. the median of strings, or min/max of an unordered categorical)
    does not prevent the others from being reported. Returns None when none
    of the three statistics can be computed.
    """
    import pandas as pd

    stats = {}
    for stat_name in ("min", "median", "max"):
        try:
            stats[stat_name] = getattr(feature, stat_name)()
        except (TypeError, ValueError, NotImplementedError):
            # This reduction is not defined for the column's dtype; skip it.
            continue
    return pd.Series(stats) if stats else None


def hot_stats(data, column, verbose=False, t=None):
    """Print a statistical summary of one column of a dataframe.

    Scans the values of a column within a dataframe and displays its
    datatype, nulls (incl. pct of total), unique values, non-null value
    counts, and statistical info (if the datatype is numeric).

    Parameters
    ----------
    data : pandas.DataFrame
        Dataframe that contains the column.
    column : str
        Name of the column within the dataframe to scan.
    verbose : bool, optional (default=False)
        When truthy, also display every unique value found.
    t : str, optional (default=None)
        Name of a target column; when given and `column` is a float or
        integer column, the correlation coefficient between `column` and
        the target is displayed. Ignored for non-numeric columns.

    Returns
    -------
    None
        Everything is written to stdout with print().

    Examples
    --------
    hot_stats(df, 'str_column')
        --> where df = data, 'str_column' = column you want to scan
    hot_stats(df, 'numeric_column', t='target')
        --> where 'target' = column to check correlation value

    Developer notes
    ---------------
    Additional features to add in the future:
        - get mode(s)
        - functionality for string objects
        - pass multiple columns at once and display all

    Sample output
    -------------
    ****************************************
    -------->
    HOT!STATS
    <--------

    CONDITION
    Data Type: int64

    count    21597.000000
    mean         3.409825
    std          0.650546
    min          1.000000
    25%          3.000000
    50%          3.000000
    75%          4.000000
    max          5.000000
    Name: condition, dtype: float64

    à-la-Mode:
    0    3
    dtype: int64

    No Nulls Found!

    Non-Null Value Counts:
    3    14020
    4     5677
    5     1701
    2      170
    1       29
    Name: condition, dtype: int64

    # Unique Values: 5
    """
    # Imported locally so the function does not depend on module-level
    # pandas imports being present.
    from pandas.api.types import is_float_dtype, is_integer_dtype

    # assigns variables to call later as shortcuts
    feature = data[column]
    rdash = "-------->"
    ldash = "<--------"

    # Figure out which statistics to display based on dtype. The dtype
    # helpers match every float/integer width (int32, float32, ...), not
    # only the platform default, so those columns keep their corr check.
    if is_float_dtype(feature):
        summary = feature.describe().round(2)
    elif is_integer_dtype(feature):
        summary = feature.describe()
    else:
        summary = _non_numeric_summary(feature)
        t = None  # ignores corr check for non-numeric dtypes by resetting t

    # display statistics (returns different info depending on datatype)
    print(rdash)
    print("HOT!STATS")
    print(ldash)

    # display column name
    print(f"\n{feature.name.upper()}")
    # display the data type
    print(f"Data Type: {feature.dtype}\n")

    # display the summary statistics, then the mode
    print(summary, "\n")
    print(f"à-la-Mode: \n{feature.mode()}\n")

    # find nulls and display total count and percentage
    null_count = feature.isna().sum()
    if null_count > 0:
        print(f"Found\n{null_count} Nulls out of {len(feature)}({round(null_count/len(feature)*100,2)}%)\n")
    else:
        print("\nNo Nulls Found!\n")

    # display value counts (non-nulls)
    print(f"Non-Null Value Counts:\n{feature.value_counts()}\n")

    # display count of unique values
    print(f"# Unique Values: {len(feature.unique())}\n")
    # displays all unique values found if verbose set to true
    if verbose:
        print(f"Unique Values:\n {feature.unique()}\n")

    # display correlation coefficient with target for numeric columns
    if t is not None:
        # float() makes the rounding independent of the scalar type
        # returned by Series.corr.
        corr = round(float(feature.corr(data[t])), 4)
        print(f"Correlation with {t.upper()}: {corr}")