"""My Template Module
Name: Ru Keïn
Email: rukeine@gmail.com
GitHub Profile: https://github.com/hakkeray
"""
# import fsds_100719 as fs
from fsds_100719.ds import ihelp,ihelp_menu, reload
from fsds_100719.jmi import print_docstring_template
# print(f"[i] You're using V {fs.__version__} of fsds.")
# HOT_STATS() function: display statistical summaries of a feature column
def hot_stats(data, column, verbose=False, t=None):
    """
    Print a quick statistical profile of one column of a dataframe.

    Displays the column's datatype, a summary-statistics table, the mode(s),
    null count (incl. pct of total), non-null value counts, the number of
    unique values and, optionally, the unique values themselves and the
    correlation coefficient against a target column.
    ---------------------------------------------
    Parameters:
        data: pandas DataFrame that contains `column` (and `t`, if given).
        column: label of the column to scan (e.g. 'price').
        verbose: (optional) boolean (default=False); when truthy, all unique
            values found in the column are displayed as well.
        t: (optional) label of a target column; when given and `column` is a
            float or integer column, the correlation between the two columns
            is computed with pandas Series.corr() and displayed. It is
            ignored for every other dtype.

    Returns:
        None. All output is printed to stdout.

    Raises:
        KeyError: if `column` (or `t`, when it is used) is not in `data`.
    -------------
    Examples:
        hot_stats(df, 'str_column') --> where df = data, 'str_column' = column you want to scan
        hot_stats(df, 'numeric_column', t='target') --> where 'target' = column to check correlation value
    -----------------
    Developer notes: additional features to add in the future:
        -pass multiple columns at once and display all
    -----------------
    SAMPLE OUTPUT:
    ****************************************
    -------->
    HOT!STATS
    <--------

    CONDITION
    Data Type: int64

    count    21597.000000
    mean         3.409825
    std          0.650546
    min          1.000000
    25%          3.000000
    50%          3.000000
    75%          4.000000
    max          5.000000
    Name: condition, dtype: float64

    à-la-Mode:
    0    3
    dtype: int64

    No Nulls Found!

    Non-Null Value Counts:
    3    14020
    4     5677
    5     1701
    2      170
    1       29
    Name: condition, dtype: int64

    # Unique Values: 5
    """
    # Imported locally so the module's top-level imports stay untouched.
    from pandas.api.types import (
        is_datetime64_any_dtype,
        is_float_dtype,
        is_integer_dtype,
    )

    feature = data[column]
    rdash = "-------->"
    ldash = "<--------"

    # Pick the summary table based on the dtype *family*. Comparing the dtype
    # to the strings 'float'/'int' only matched the 64-bit defaults, and the
    # former `== 'object' or 'category' or ...` test was always true, so every
    # other dtype (float32, int32, ...) was wrongly treated as non-numeric.
    if is_float_dtype(feature):
        summary = feature.describe().round(2)
    elif is_integer_dtype(feature):
        summary = feature.describe()
    elif is_datetime64_any_dtype(feature):
        summary = feature.agg(['min', 'median', 'max'])
        t = None  # correlation is only reported for numeric columns
    else:
        # object / category / bool / anything else: describe() reports
        # count, unique, top and freq. min/median/max is not usable here
        # because a median is undefined for strings and raises.
        summary = feature.describe()
        t = None  # correlation is only reported for numeric columns

    # banner
    print(rdash)
    print("HOT!STATS")
    print(ldash)

    # column name in upper case (str() so non-string labels also work)
    print(f"\n{str(feature.name).upper()}")
    # data type
    print(f"Data Type: {feature.dtype}\n")
    # summary statistics chosen above
    print(summary, "\n")
    # mode(s)
    print(f"à-la-Mode: \n{feature.mode()}\n")

    # nulls: total count and percentage (count computed once and reused)
    null_count = feature.isna().sum()
    if null_count > 0:
        print(f"Found\n{null_count} Nulls out of {len(feature)}({round(null_count/len(feature)*100,2)}%)\n")
    else:
        print("\nNo Nulls Found!\n")

    # value counts (nulls are excluded by value_counts)
    print(f"Non-Null Value Counts:\n{feature.value_counts()}\n")

    # number of unique values (unique() keeps a null as its own value)
    uniques = feature.unique()
    print(f"# Unique Values: {len(uniques)}\n")
    if verbose:
        print(f"Unique Values:\n {uniques}\n")

    # correlation coefficient with the target, numeric columns only
    if t is not None:
        # built-in round() works whether corr() hands back a numpy scalar
        # or a plain Python float
        corr = round(feature.corr(data[t]), 4)
        print(f"Correlation with {str(t).upper()}: {corr}")