# Source code for fsds_100719.datasets


"""A collection of convenient csv urls and sklearn datasets as dataframes."""
def load_data(*args,**kwargs):
    """Removed loader kept only to point callers at its replacements.

    Args:
        *args: ignored.
        **kwargs: ignored.

    Raises:
        NotImplementedError: always. It is a subclass of Exception, so code
            that caught the previous bare Exception keeps working.
    """
    raise NotImplementedError('load_data() has been replaced by individual load functions. i.e. fs.datasets.load_boston()')



def read_csv_from_url(url,verbose=False,read_csv_kwds=None):
    """Loading function to load all .csv datasets.

    Args:
        url (str): csv raw link; handed straight to pd.read_csv().
        verbose (bool): if True, display df.head() after loading.
        read_csv_kwds (dict, optional): keyword arguments to feed to
            pd.read_csv(). None (the default) or an empty dict feeds nothing.

    Returns:
        df (DataFrame): the dataset
    """
    import pandas as pd

    # Defaulting to None (instead of a shared {} literal) avoids the
    # mutable-default-argument pitfall; None and {} behave identically here.
    df = pd.read_csv(url, **(read_csv_kwds or {}))

    if verbose:
        # Imported lazily so IPython is only needed when output is requested.
        from IPython.display import display
        display(df.head())
    return df


def load_heroes_info(verbose=False,read_csv_kwds={}):
    """Load heroes_information.csv from its raw GitHub url.

    Args:
        verbose (bool): forwarded to read_csv_from_url().
        read_csv_kwds (dict): forwarded to read_csv_from_url().

    Returns:
        The result of read_csv_from_url() for this url.
    """
    csv_url = 'https://raw.githubusercontent.com/jirvingphd/dsc-data-cleaning-project-online-ds-ft-100719/master/heroes_information.csv'
    heroes_info = read_csv_from_url(csv_url, verbose=verbose, read_csv_kwds=read_csv_kwds)
    return heroes_info
    

def load_heroes_powers(verbose=False,read_csv_kwds={}):
    """Load super_hero_powers.csv from its raw GitHub url.

    Args:
        verbose (bool): forwarded to read_csv_from_url().
        read_csv_kwds (dict): forwarded to read_csv_from_url().

    Returns:
        The result of read_csv_from_url() for this url.
    """
    csv_url = "https://raw.githubusercontent.com/learn-co-students/dsc-data-cleaning-project-online-ds-ft-100719/master/super_hero_powers.csv"
    heroes_powers = read_csv_from_url(csv_url, verbose=verbose, read_csv_kwds=read_csv_kwds)
    return heroes_powers

def load_titanic(verbose=False,read_csv_kwds={}):
    """Load titanic.csv from its raw GitHub url.

    Args:
        verbose (bool): forwarded to read_csv_from_url().
        read_csv_kwds (dict): forwarded to read_csv_from_url().

    Returns:
        The result of read_csv_from_url() for this url.
    """
    csv_url = "https://raw.githubusercontent.com/jirvingphd/dsc-dealing-missing-data-lab-online-ds-ft-100719/master/titanic.csv"
    titanic = read_csv_from_url(csv_url, verbose=verbose, read_csv_kwds=read_csv_kwds)
    return titanic


def load_mod1_proj(verbose=False,read_csv_kwds={}):
    """Load kc_house_data.csv (mod 1 final project) from its raw GitHub url.

    Args:
        verbose (bool): forwarded to read_csv_from_url().
        read_csv_kwds (dict): forwarded to read_csv_from_url().

    Returns:
        The result of read_csv_from_url() for this url.
    """
    csv_url = "https://raw.githubusercontent.com/learn-co-students/dsc-v2-mod1-final-project-online-ds-ft-100719/master/kc_house_data.csv"
    house_data = read_csv_from_url(csv_url, verbose=verbose, read_csv_kwds=read_csv_kwds)
    return house_data
        

def load_population(verbose=False,read_csv_kwds={}):
    """Load population.csv from its raw GitHub url.

    Args:
        verbose (bool): forwarded to read_csv_from_url().
        read_csv_kwds (dict): forwarded to read_csv_from_url().

    Returns:
        The result of read_csv_from_url() for this url.
    """
    csv_url = "https://raw.githubusercontent.com/learn-co-students/dsc-subplots-and-enumeration-lab-online-ds-ft-100719/master/population.csv"
    population = read_csv_from_url(csv_url, verbose=verbose, read_csv_kwds=read_csv_kwds)
    return population

def load_autompg(verbose=True,read_csv_kwds={}):
    """Load auto-mpg.csv from its raw GitHub url.

    Note that verbose defaults to True for this loader.

    Args:
        verbose (bool): if True, print the Kaggle page describing the
            dataset; also forwarded to read_csv_from_url().
        read_csv_kwds (dict): forwarded to read_csv_from_url().

    Returns:
        The result of read_csv_from_url() for this url.
    """
    csv_url = 'https://raw.githubusercontent.com/jirvingphd/dsc-dealing-with-categorical-variables-online-ds-ft-100719/master/auto-mpg.csv'

    # Point the user at the dataset's description before loading it.
    if verbose:
        print('[i] Source url with details: https://www.kaggle.com/uciml/autompg-dataset')

    autompg = read_csv_from_url(csv_url, verbose=verbose, read_csv_kwds=read_csv_kwds)
    return autompg




def load_boston(verbose=False):
    """Load sklearn's boston housing dataset as one DataFrame.

    The feature columns are named from the dataset's 'feature_names' and the
    target is appended as a 'price' column.

    Args:
        verbose (bool): if True, print a loading message before loading and
            the dataset's 'DESCR' text afterwards.

    Returns:
        df (DataFrame): features plus the 'price' target column.
    """
    from sklearn import datasets
    import pandas as pd

    if verbose:
        print("[i] Loading boston housing dataset from sklearn.datasets")

    # NOTE(review): datasets.load_boston is reportedly unavailable in newer
    # scikit-learn releases -- confirm against the pinned sklearn version.
    boston = datasets.load_boston()

    # Features first, then the target as an extra column.
    df = pd.DataFrame(boston['data'], columns=boston['feature_names'])
    df['price'] = boston['target']

    if verbose:
        print(boston['DESCR'])

    return df

def load_iris(verbose=False):
    """Load sklearn's iris dataset as one DataFrame.

    Args:
        verbose (bool): if True, print a loading message before loading and
            the dataset's 'DESCR' text afterwards.

    Returns:
        df (DataFrame): the feature columns (named from 'feature_names'),
            a 'target' column with the class codes and a 'target_name'
            column with the matching class names.
    """
    from sklearn import datasets
    import pandas as pd

    if verbose:
        print('[i] Loading iris datset from sklearn.datasets')
    data_dict = datasets.load_iris()

    # Get dataframe
    df = pd.DataFrame(data_dict['data'], columns=data_dict['feature_names'])
    df['target'] = data_dict['target']

    # Map each class code to its name by position in target_names. This
    # yields the same 0..n-1 -> name pairing the previous set()-based zip
    # produced, without depending on set iteration order.
    iris_map = dict(enumerate(data_dict['target_names']))
    df['target_name'] = df['target'].map(iris_map)

    if verbose:
        print(data_dict['DESCR'])
    return df


def load_height_weight(verbose=False,read_csv_kwds={}):
    """Load the height vs weight dataset (weight-height.csv).

    Args:
        verbose (bool): forwarded to read_csv_from_url().
        read_csv_kwds (dict): forwarded to read_csv_from_url().

    Returns:
        The result of read_csv_from_url() for this url.
    """
    csv_url = 'https://raw.githubusercontent.com/jirvingphd/dsc-probability-density-function-online-ds-ft-100719/master/weight-height.csv'
    height_weight = read_csv_from_url(csv_url, verbose=verbose, read_csv_kwds=read_csv_kwds)
    return height_weight


def load_iowa_prisoners(verbose=False,vers='raw',read_csv_kwds={}):
    """Load one of the two Iowa prisoner recidivism csv files.

    Args:
        verbose (bool): forwarded to read_csv_from_url().
        vers (str): if it contains the substring 'raw', the full raw file is
            loaded; any other value loads the renamed-columns file.
        read_csv_kwds (dict): forwarded to read_csv_from_url().

    Returns:
        The result of read_csv_from_url() for the selected url.
    """
    import pandas as pd

    # Substring test: any vers containing 'raw' selects the raw file.
    csv_url = (
        'https://raw.githubusercontent.com/jirvingphd/dsc-3-final-project-online-ds-ft-021119/master/datasets/FULL_3-Year_Recidivism_for_Offenders_Released_from_Prison_in_Iowa.csv'
        if 'raw' in vers
        else 'https://raw.githubusercontent.com/jirvingphd/dsc-3-final-project-online-ds-ft-021119/master/datasets/Iowa_Prisoners_Renamed_Columns_fsds_100719.csv'
    )
    return read_csv_from_url(csv_url, verbose=verbose, read_csv_kwds=read_csv_kwds)

def load_height_by_country(verbose=False,read_csv_kwds={}):
    """Load height_by_country_age18.csv from its raw GitHub url.

    Args:
        verbose (bool): if True, print the dataset's source page after
            loading; also forwarded to read_csv_from_url().
        read_csv_kwds (dict): forwarded to read_csv_from_url().

    Returns:
        The result of read_csv_from_url() for this url.
    """
    csv_url = 'https://raw.githubusercontent.com/jirvingphd/fsds_100719/master/fsds_100719/data/height_by_country_age18.csv'
    heights = read_csv_from_url(csv_url, verbose=verbose, read_csv_kwds=read_csv_kwds)

    # Credit the original data source once the data has been loaded.
    if verbose:
        origin = "http://ncdrisc.org/data-downloads-height.html"
        print(f'Source of dataset: {origin}')

    return heights



### TIME SERIES

# baltimore_crime ="https://raw.githubusercontent.com/jirvingphd/fsds_100719/master/fsds_100719/data/BPD_Part_1_Victim_Based_Crime_Data.csv"
# std_rates = "https://raw.githubusercontent.com/jirvingphd/fsds_100719/master/fsds_100719/data/STD%20Cases.csv"
# no_sex_xlsx = "https://raw.githubusercontent.com/jirvingphd/fsds_100719/master/fsds_100719/data/Americans%20Sex%20Frequency.xlsx"

# learn_passengers="https://raw.githubusercontent.com/learn-co-students/dsc-removing-trends-lab-online-ds-ft-100719/master/passengers.csv"

def load_ts_baltimore_crime_full(read_csv_kwds={}):
    """Load BPD_Part_1_Victim_Based_Crime_Data.csv from its raw GitHub url.

    Always loads with verbose=False.

    Args:
        read_csv_kwds (dict): forwarded to read_csv_from_url().

    Returns:
        The result of read_csv_from_url() for this url.
    """
    csv_url = "https://raw.githubusercontent.com/jirvingphd/fsds_100719/master/fsds_100719/data/BPD_Part_1_Victim_Based_Crime_Data.csv"
    crime_full = read_csv_from_url(csv_url, verbose=False, read_csv_kwds=read_csv_kwds)
    return crime_full

### TIME SERIES
def load_ts_baltimore_crime_counts(read_csv_kwds={}):
    """Load baltimore_crime_counts_2014-2019.csv from its raw GitHub url.

    Always loads with verbose=False.

    Args:
        read_csv_kwds (dict): forwarded to read_csv_from_url().

    Returns:
        The result of read_csv_from_url() for this url.
    """
    csv_url = "https://raw.githubusercontent.com/jirvingphd/fsds_100719/master/fsds_100719/data/baltimore_crime_counts_2014-2019.csv"
    crime_counts = read_csv_from_url(csv_url, verbose=False, read_csv_kwds=read_csv_kwds)
    return crime_counts


def load_ts_mintemp(verbose=False,read_csv_kwds={}):
    """Load min_temp.csv from the introduction-to-time-series repo.

    Args:
        verbose (bool): if True, print the lesson the data comes from; also
            forwarded to read_csv_from_url().
        read_csv_kwds (dict): forwarded to read_csv_from_url().

    Returns:
        The result of read_csv_from_url() for this url.
    """
    csv_url = 'https://raw.githubusercontent.com/jirvingphd/dsc-introduction-to-time-series-online-ds-ft-100719/master/min_temp.csv'
    if verbose:
        print("From Introduction to Time Series")
    min_temp = read_csv_from_url(csv_url, verbose=verbose, read_csv_kwds=read_csv_kwds)
    return min_temp


def load_ts_nyse_monthly(verbose=False,read_csv_kwds={}):
    """Load NYSE_monthly.csv from the introduction-to-time-series repo.

    Args:
        verbose (bool): if True, print the lesson the data comes from; also
            forwarded to read_csv_from_url().
        read_csv_kwds (dict): forwarded to read_csv_from_url().

    Returns:
        The result of read_csv_from_url() for this url.
    """
    csv_url = 'https://raw.githubusercontent.com/jirvingphd/dsc-introduction-to-time-series-online-ds-ft-100719/master/NYSE_monthly.csv'
    if verbose:
        print("From Introduction to Time Series")
    nyse_monthly = read_csv_from_url(csv_url, verbose=verbose, read_csv_kwds=read_csv_kwds)
    return nyse_monthly


def load_ts_exch_rates(verbose=False,read_csv_kwds={}):
    """Load exch_rates.csv from its raw GitHub url.

    Args:
        verbose (bool): forwarded to read_csv_from_url().
        read_csv_kwds (dict): forwarded to read_csv_from_url().

    Returns:
        The result of read_csv_from_url() for this url.
    """
    csv_url = "https://raw.githubusercontent.com/jirvingphd/dsc-basic-time-series-models-online-ds-ft-100719/master/exch_rates.csv"
    exch_rates = read_csv_from_url(csv_url, verbose=verbose, read_csv_kwds=read_csv_kwds)
    return exch_rates


def load_ts_google_trends(read_csv_kwds={}):
    """Load google_trends.csv from its raw GitHub url.

    Always loads with verbose=False.

    Args:
        read_csv_kwds (dict): forwarded to read_csv_from_url().

    Returns:
        The result of read_csv_from_url() for this url.
    """
    csv_url = 'https://raw.githubusercontent.com/jirvingphd/dsc-corr-autocorr-in-time-series-online-ds-ft-100719/master/google_trends.csv'
    google_trends = read_csv_from_url(csv_url, verbose=False, read_csv_kwds=read_csv_kwds)
    return google_trends


def load_ts_winning_400m(read_csv_kwds={}):
    """Load winning_400m.csv from its raw GitHub url.

    Always loads with verbose=False.

    Args:
        read_csv_kwds (dict): forwarded to read_csv_from_url().

    Returns:
        The result of read_csv_from_url() for this url.
    """
    csv_url = "https://raw.githubusercontent.com/jirvingphd/dsc-arma-models-lab-online-ds-ft-100719/master/winning_400m.csv"
    winning_400m = read_csv_from_url(csv_url, verbose=False, read_csv_kwds=read_csv_kwds)
    return winning_400m


def load_ts_std_cases(read_csv_kwds={}):
    """Load 'STD Cases.csv' from its raw GitHub url.

    Always loads with verbose=False.

    Args:
        read_csv_kwds (dict): forwarded to read_csv_from_url().

    Returns:
        The result of read_csv_from_url() for this url.
    """
    csv_url = 'https://raw.githubusercontent.com/jirvingphd/fsds_100719/master/fsds_100719/data/STD%20Cases.csv'
    std_cases = read_csv_from_url(csv_url, verbose=False, read_csv_kwds=read_csv_kwds)
    return std_cases

def load_ts_american_sex_frequency(read_csv_kwds=None):
    """Load the 'Americans Sex Frequency.xlsx' workbook from its raw GitHub url.

    Unlike the csv loaders in this module, this file is an Excel workbook,
    so it is read with pd.read_excel() rather than pd.read_csv().

    Args:
        read_csv_kwds (dict, optional): keyword arguments forwarded to
            pd.read_excel() (the name is kept for parity with the csv
            loaders). None (the default) or an empty dict forwards nothing.

    Returns:
        The result of pd.read_excel() for this url.
    """
    import pandas as pd

    url = 'https://raw.githubusercontent.com/jirvingphd/fsds_100719/master/fsds_100719/data/Americans%20Sex%20Frequency.xlsx'

    # Tolerate None the same way read_csv_from_url() does; unpacking None
    # with ** would raise a TypeError.
    return pd.read_excel(url, **(read_csv_kwds or {}))

# def load_ts_co2(read_csv_kwds={}):
#     import statsmodels.api as sm
#     df = sm.datasets.co2.load()
#     return df


def load_AB_multiple_choice(verbose=False,read_csv_kwds={}):
    """Load multipleChoiceResponses_cleaned.csv from its raw GitHub url.

    Args:
        verbose (bool): forwarded to read_csv_from_url(), which displays
            df.head() when it is True.
        read_csv_kwds (dict): forwarded to read_csv_from_url().

    Returns:
        The result of read_csv_from_url() for this url.
    """
    url = 'https://raw.githubusercontent.com/jirvingphd/dsc-in-depth-ab-testing-lab-online-ds-pt-100719/master/multipleChoiceResponses_cleaned.csv'

    # read_csv_from_url() already displays df.head() when verbose is True,
    # so displaying it again here would print the preview twice.
    return read_csv_from_url(url, verbose=verbose, read_csv_kwds=read_csv_kwds)