Source code for fsds_100719.datasets


"""A collection of convenient csv urls and sklearn datasets as dataframes."""
def load_data(*args, **kwargs):
    """Removed entry point kept only to point callers at its replacements.

    Any positional or keyword arguments are accepted and ignored.

    Raises:
        Exception: Always, with a message directing the caller to the
            individual ``load_*`` functions.
    """
    message = (
        'load_data() has been replaced by individual load functions. '
        'i.e. fs.datasets.load_boston()'
    )
    raise Exception(message)
def read_csv_from_url(url, verbose=False, read_csv_kwds=None):
    """Load a .csv dataset into a DataFrame.

    Shared loading function used by the csv ``load_*`` helpers.

    Args:
        url (str): Raw link to the csv file; handed straight to
            ``pd.read_csv``.
        verbose (bool): If True, display ``df.head()`` after loading.
        read_csv_kwds (dict, optional): Keyword arguments forwarded to
            ``pd.read_csv``. ``None`` (the default) means no extra
            arguments.

    Returns:
        DataFrame: The loaded dataset.
    """
    import pandas as pd

    # A None default avoids sharing one mutable dict across every call.
    if read_csv_kwds is None:
        read_csv_kwds = {}

    df = pd.read_csv(url, **read_csv_kwds)

    if verbose:
        # Imported lazily so that non-verbose loading does not need IPython;
        # fall back to plain print when IPython is not installed.
        try:
            from IPython.display import display
        except ImportError:
            display = print
        display(df.head())

    return df
def load_heroes_info(verbose=False, read_csv_kwds=None):
    """Load ``heroes_information.csv`` as a DataFrame.

    Args:
        verbose (bool): Forwarded to ``read_csv_from_url``, which displays
            ``df.head()`` when True.
        read_csv_kwds (dict, optional): Keyword arguments for
            ``pd.read_csv``, forwarded via ``read_csv_from_url``. ``None``
            (the default) means no extra arguments.

    Returns:
        DataFrame: The loaded dataset.
    """
    url = 'https://raw.githubusercontent.com/jirvingphd/dsc-data-cleaning-project-online-ds-ft-100719/master/heroes_information.csv'
    return read_csv_from_url(url, verbose=verbose, read_csv_kwds=read_csv_kwds)
def load_heroes_powers(verbose=False, read_csv_kwds=None):
    """Load ``super_hero_powers.csv`` as a DataFrame.

    Args:
        verbose (bool): Forwarded to ``read_csv_from_url``, which displays
            ``df.head()`` when True.
        read_csv_kwds (dict, optional): Keyword arguments for
            ``pd.read_csv``, forwarded via ``read_csv_from_url``. ``None``
            (the default) means no extra arguments.

    Returns:
        DataFrame: The loaded dataset.
    """
    url = "https://raw.githubusercontent.com/learn-co-students/dsc-data-cleaning-project-online-ds-ft-100719/master/super_hero_powers.csv"
    return read_csv_from_url(url, verbose=verbose, read_csv_kwds=read_csv_kwds)
def load_titanic(verbose=False, read_csv_kwds=None):
    """Load ``titanic.csv`` as a DataFrame.

    Args:
        verbose (bool): Forwarded to ``read_csv_from_url``, which displays
            ``df.head()`` when True.
        read_csv_kwds (dict, optional): Keyword arguments for
            ``pd.read_csv``, forwarded via ``read_csv_from_url``. ``None``
            (the default) means no extra arguments.

    Returns:
        DataFrame: The loaded dataset.
    """
    url = "https://raw.githubusercontent.com/jirvingphd/dsc-dealing-missing-data-lab-online-ds-ft-100719/master/titanic.csv"
    return read_csv_from_url(url, verbose=verbose, read_csv_kwds=read_csv_kwds)
def load_mod1_proj(verbose=False, read_csv_kwds=None):
    """Load ``kc_house_data.csv`` (the mod 1 final-project data) as a DataFrame.

    Args:
        verbose (bool): Forwarded to ``read_csv_from_url``, which displays
            ``df.head()`` when True.
        read_csv_kwds (dict, optional): Keyword arguments for
            ``pd.read_csv``, forwarded via ``read_csv_from_url``. ``None``
            (the default) means no extra arguments.

    Returns:
        DataFrame: The loaded dataset.
    """
    url = "https://raw.githubusercontent.com/learn-co-students/dsc-v2-mod1-final-project-online-ds-ft-100719/master/kc_house_data.csv"
    return read_csv_from_url(url, verbose=verbose, read_csv_kwds=read_csv_kwds)
def load_population(verbose=False, read_csv_kwds=None):
    """Load ``population.csv`` as a DataFrame.

    Args:
        verbose (bool): Forwarded to ``read_csv_from_url``, which displays
            ``df.head()`` when True.
        read_csv_kwds (dict, optional): Keyword arguments for
            ``pd.read_csv``, forwarded via ``read_csv_from_url``. ``None``
            (the default) means no extra arguments.

    Returns:
        DataFrame: The loaded dataset.
    """
    url = "https://raw.githubusercontent.com/learn-co-students/dsc-subplots-and-enumeration-lab-online-ds-ft-100719/master/population.csv"
    return read_csv_from_url(url, verbose=verbose, read_csv_kwds=read_csv_kwds)
def load_autompg(verbose=True, read_csv_kwds=None):
    """Load ``auto-mpg.csv`` as a DataFrame.

    Args:
        verbose (bool): If True (the default for this loader), print the
            Kaggle page describing the dataset before loading. The flag is
            also forwarded to ``read_csv_from_url``, which displays
            ``df.head()`` when True.
        read_csv_kwds (dict, optional): Keyword arguments for
            ``pd.read_csv``, forwarded via ``read_csv_from_url``. ``None``
            (the default) means no extra arguments.

    Returns:
        DataFrame: The loaded dataset.
    """
    if verbose:
        print('[i] Source url with details: https://www.kaggle.com/uciml/autompg-dataset')

    url = 'https://raw.githubusercontent.com/jirvingphd/dsc-dealing-with-categorical-variables-online-ds-ft-100719/master/auto-mpg.csv'
    return read_csv_from_url(url, verbose=verbose, read_csv_kwds=read_csv_kwds)
def load_boston(verbose=False):
    """Return sklearn's Boston housing dataset as a single DataFrame.

    Feature columns are named after the dataset's ``feature_names``; the
    target values are appended as a ``price`` column.

    Args:
        verbose (bool): If True, print a loading message before fetching
            and the dataset description (``DESCR``) afterwards.

    Returns:
        DataFrame: Features plus the ``price`` target column.
    """
    # NOTE(review): sklearn.datasets.load_boston was deprecated in
    # scikit-learn 1.0 and removed in 1.2 -- confirm the pinned version.
    from sklearn import datasets
    import pandas as pd

    if verbose:
        print("[i] Loading boston housing dataset from sklearn.datasets")

    bunch = datasets.load_boston()

    # Build the feature table and attach the target in one expression.
    df = pd.DataFrame(
        bunch['data'], columns=bunch['feature_names']
    ).assign(price=bunch['target'])

    if verbose:
        print(bunch['DESCR'])
    return df
def load_iris(verbose=False):
    """Return sklearn's iris dataset as a single DataFrame.

    Feature columns are named after the dataset's ``feature_names``. Two
    extra columns are appended: ``target`` (the integer class code) and
    ``target_name`` (the matching entry of ``target_names``).

    Args:
        verbose (bool): If True, print a loading message before fetching
            and the dataset description (``DESCR``) afterwards.

    Returns:
        DataFrame: Features plus ``target`` and ``target_name`` columns.
    """
    from sklearn import datasets
    import pandas as pd

    if verbose:
        print('[i] Loading iris datset from sklearn.datasets')

    data_dict = datasets.load_iris()

    df = pd.DataFrame(data_dict['data'], columns=data_dict['feature_names'])
    df['target'] = data_dict['target']

    # Target codes are positions in target_names, so enumerate gives the
    # code -> name mapping directly, without relying on set iteration order.
    iris_map = dict(enumerate(data_dict['target_names']))
    df['target_name'] = df['target'].map(iris_map)

    if verbose:
        print(data_dict['DESCR'])
    return df
def load_height_weight(verbose=False, read_csv_kwds=None):
    """Load the height vs weight dataset (``weight-height.csv``).

    Args:
        verbose (bool): Forwarded to ``read_csv_from_url``, which displays
            ``df.head()`` when True.
        read_csv_kwds (dict, optional): Keyword arguments for
            ``pd.read_csv``, forwarded via ``read_csv_from_url``. ``None``
            (the default) means no extra arguments.

    Returns:
        DataFrame: The loaded dataset.
    """
    url = 'https://raw.githubusercontent.com/jirvingphd/dsc-probability-density-function-online-ds-ft-100719/master/weight-height.csv'
    return read_csv_from_url(url, verbose=verbose, read_csv_kwds=read_csv_kwds)
def load_iowa_prisoners(verbose=False, vers='raw', read_csv_kwds=None):
    """Load the Iowa prisoner recidivism dataset as a DataFrame.

    Args:
        verbose (bool): Forwarded to ``read_csv_from_url``, which displays
            ``df.head()`` when True.
        vers (str): Which file to load. Any value containing the substring
            ``'raw'`` selects the full 3-year recidivism csv; any other
            value selects the renamed-columns csv.
        read_csv_kwds (dict, optional): Keyword arguments for
            ``pd.read_csv``, forwarded via ``read_csv_from_url``. ``None``
            (the default) means no extra arguments.

    Returns:
        DataFrame: The loaded dataset.
    """
    if 'raw' in vers:
        url = 'https://raw.githubusercontent.com/jirvingphd/dsc-3-final-project-online-ds-ft-021119/master/datasets/FULL_3-Year_Recidivism_for_Offenders_Released_from_Prison_in_Iowa.csv'
    else:
        url = 'https://raw.githubusercontent.com/jirvingphd/dsc-3-final-project-online-ds-ft-021119/master/datasets/Iowa_Prisoners_Renamed_Columns_fsds_100719.csv'

    return read_csv_from_url(url, verbose=verbose, read_csv_kwds=read_csv_kwds)
def load_height_by_country(verbose=False, read_csv_kwds=None):
    """Load ``height_by_country_age18.csv`` as a DataFrame.

    Args:
        verbose (bool): If True, print the dataset's source page after
            loading. The flag is also forwarded to ``read_csv_from_url``,
            which displays ``df.head()`` when True.
        read_csv_kwds (dict, optional): Keyword arguments for
            ``pd.read_csv``, forwarded via ``read_csv_from_url``. ``None``
            (the default) means no extra arguments.

    Returns:
        DataFrame: The loaded dataset.
    """
    url = 'https://raw.githubusercontent.com/jirvingphd/fsds_100719/master/fsds_100719/data/height_by_country_age18.csv'
    df = read_csv_from_url(url, verbose=verbose, read_csv_kwds=read_csv_kwds)

    if verbose:
        source = "http://ncdrisc.org/data-downloads-height.html"
        print(f'Source of dataset: {source}')
    return df
### TIME SERIES
# baltimore_crime = "https://raw.githubusercontent.com/jirvingphd/fsds_100719/master/fsds_100719/data/BPD_Part_1_Victim_Based_Crime_Data.csv"
# std_rates = "https://raw.githubusercontent.com/jirvingphd/fsds_100719/master/fsds_100719/data/STD%20Cases.csv"
# no_sex_xlsx = "https://raw.githubusercontent.com/jirvingphd/fsds_100719/master/fsds_100719/data/Americans%20Sex%20Frequency.xlsx"
# learn_passengers = "https://raw.githubusercontent.com/learn-co-students/dsc-removing-trends-lab-online-ds-ft-100719/master/passengers.csv"
def load_ts_baltimore_crime_full(read_csv_kwds=None):
    """Load ``BPD_Part_1_Victim_Based_Crime_Data.csv`` as a DataFrame.

    Always loads quietly: ``verbose=False`` is hard-coded in the call to
    ``read_csv_from_url``.

    Args:
        read_csv_kwds (dict, optional): Keyword arguments for
            ``pd.read_csv``, forwarded via ``read_csv_from_url``. ``None``
            (the default) means no extra arguments.

    Returns:
        DataFrame: The loaded dataset.
    """
    url = "https://raw.githubusercontent.com/jirvingphd/fsds_100719/master/fsds_100719/data/BPD_Part_1_Victim_Based_Crime_Data.csv"
    return read_csv_from_url(url, verbose=False, read_csv_kwds=read_csv_kwds)
### TIME SERIES
def load_ts_baltimore_crime_counts(read_csv_kwds=None):
    """Load ``baltimore_crime_counts_2014-2019.csv`` as a DataFrame.

    Always loads quietly: ``verbose=False`` is hard-coded in the call to
    ``read_csv_from_url``.

    Args:
        read_csv_kwds (dict, optional): Keyword arguments for
            ``pd.read_csv``, forwarded via ``read_csv_from_url``. ``None``
            (the default) means no extra arguments.

    Returns:
        DataFrame: The loaded dataset.
    """
    url = "https://raw.githubusercontent.com/jirvingphd/fsds_100719/master/fsds_100719/data/baltimore_crime_counts_2014-2019.csv"
    return read_csv_from_url(url, verbose=False, read_csv_kwds=read_csv_kwds)
def load_ts_mintemp(verbose=False, read_csv_kwds=None):
    """Load ``min_temp.csv`` from the Introduction to Time Series lesson repo.

    Args:
        verbose (bool): If True, print the lesson the data comes from
            before loading. The flag is also forwarded to
            ``read_csv_from_url``, which displays ``df.head()`` when True.
        read_csv_kwds (dict, optional): Keyword arguments for
            ``pd.read_csv``, forwarded via ``read_csv_from_url``. ``None``
            (the default) means no extra arguments.

    Returns:
        DataFrame: The loaded dataset.
    """
    if verbose:
        print("From Introduction to Time Series")

    url = 'https://raw.githubusercontent.com/jirvingphd/dsc-introduction-to-time-series-online-ds-ft-100719/master/min_temp.csv'
    return read_csv_from_url(url, verbose=verbose, read_csv_kwds=read_csv_kwds)
def load_ts_nyse_monthly(verbose=False, read_csv_kwds=None):
    """Load ``NYSE_monthly.csv`` from the Introduction to Time Series lesson repo.

    Args:
        verbose (bool): If True, print the lesson the data comes from
            before loading. The flag is also forwarded to
            ``read_csv_from_url``, which displays ``df.head()`` when True.
        read_csv_kwds (dict, optional): Keyword arguments for
            ``pd.read_csv``, forwarded via ``read_csv_from_url``. ``None``
            (the default) means no extra arguments.

    Returns:
        DataFrame: The loaded dataset.
    """
    if verbose:
        print("From Introduction to Time Series")

    url = 'https://raw.githubusercontent.com/jirvingphd/dsc-introduction-to-time-series-online-ds-ft-100719/master/NYSE_monthly.csv'
    return read_csv_from_url(url, verbose=verbose, read_csv_kwds=read_csv_kwds)
def load_ts_exch_rates(verbose=False, read_csv_kwds=None):
    """Load ``exch_rates.csv`` as a DataFrame.

    Args:
        verbose (bool): Forwarded to ``read_csv_from_url``, which displays
            ``df.head()`` when True.
        read_csv_kwds (dict, optional): Keyword arguments for
            ``pd.read_csv``, forwarded via ``read_csv_from_url``. ``None``
            (the default) means no extra arguments.

    Returns:
        DataFrame: The loaded dataset.
    """
    url = "https://raw.githubusercontent.com/jirvingphd/dsc-basic-time-series-models-online-ds-ft-100719/master/exch_rates.csv"
    return read_csv_from_url(url, verbose=verbose, read_csv_kwds=read_csv_kwds)
def load_ts_winning_400m(read_csv_kwds=None):
    """Load ``winning_400m.csv`` as a DataFrame.

    Always loads quietly: ``verbose=False`` is hard-coded in the call to
    ``read_csv_from_url``.

    Args:
        read_csv_kwds (dict, optional): Keyword arguments for
            ``pd.read_csv``, forwarded via ``read_csv_from_url``. ``None``
            (the default) means no extra arguments.

    Returns:
        DataFrame: The loaded dataset.
    """
    url = "https://raw.githubusercontent.com/jirvingphd/dsc-arma-models-lab-online-ds-ft-100719/master/winning_400m.csv"
    return read_csv_from_url(url, verbose=False, read_csv_kwds=read_csv_kwds)
def load_ts_std_cases(read_csv_kwds=None):
    """Load ``STD Cases.csv`` as a DataFrame.

    Always loads quietly: ``verbose=False`` is hard-coded in the call to
    ``read_csv_from_url``.

    Args:
        read_csv_kwds (dict, optional): Keyword arguments for
            ``pd.read_csv``, forwarded via ``read_csv_from_url``. ``None``
            (the default) means no extra arguments.

    Returns:
        DataFrame: The loaded dataset.
    """
    url = 'https://raw.githubusercontent.com/jirvingphd/fsds_100719/master/fsds_100719/data/STD%20Cases.csv'
    return read_csv_from_url(url, verbose=False, read_csv_kwds=read_csv_kwds)
def load_ts_american_sex_frequency(read_csv_kwds=None):
    """Load ``Americans Sex Frequency.xlsx`` as a DataFrame.

    The file is an Excel workbook, so it is read with ``pd.read_excel``
    rather than through ``read_csv_from_url``.

    Args:
        read_csv_kwds (dict, optional): Keyword arguments forwarded to
            ``pd.read_excel`` (the parameter name matches the csv
            loaders). ``None`` (the default) means no extra arguments.

    Returns:
        The value returned by ``pd.read_excel`` for the workbook.
    """
    import pandas as pd

    url = 'https://raw.githubusercontent.com/jirvingphd/fsds_100719/master/fsds_100719/data/Americans%20Sex%20Frequency.xlsx'

    # Accept None like the csv loaders do instead of failing on `**None`.
    if read_csv_kwds is None:
        read_csv_kwds = {}
    return pd.read_excel(url, **read_csv_kwds)
# return read_csv_from_url(url,verbose=False, read_csv_kwds=read_csv_kwds)

# def load_ts_co2(read_csv_kwds={}):
#     import statsmodels.api as sm
#     df = sm.datasets.co2.load()
#     return df
def load_AB_multiple_choice(verbose=False, read_csv_kwds=None):
    """Load ``multipleChoiceResponses_cleaned.csv`` as a DataFrame.

    Args:
        verbose (bool): Forwarded to ``read_csv_from_url``, which displays
            ``df.head()`` when True.
        read_csv_kwds (dict, optional): Keyword arguments for
            ``pd.read_csv``, forwarded via ``read_csv_from_url``. ``None``
            (the default) means no extra arguments.

    Returns:
        DataFrame: The loaded dataset.
    """
    url = 'https://raw.githubusercontent.com/jirvingphd/dsc-in-depth-ab-testing-lab-online-ds-pt-100719/master/multipleChoiceResponses_cleaned.csv'
    # read_csv_from_url already displays the head when verbose is True, so
    # it is not displayed again here (it used to be shown twice).
    return read_csv_from_url(url, verbose=verbose, read_csv_kwds=read_csv_kwds)