Source code for fsds_100719.learn_scrape

import requests
import time
from time import sleep
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import pandas as pd
import numpy as  np

def start_driver(url='https://instruction.learn.co/staff/students'):
    """Launch a Chrome WebDriver session and open ``url`` in it.

    Args:
        url (str): Address to load once the browser has started.

    Returns:
        The ``webdriver.Chrome`` instance, after a one-second pause that
        follows the page request.
    """
    from selenium import webdriver

    browser = webdriver.Chrome()
    browser.get(url)
    # Short fixed pause after requesting the page, before handing back control.
    time.sleep(1)
    return browser
def load_login_data(login_data_file="/Users/jamesirving/.secret/learn_login.json", verbose=True):
    """Load login credentials from a JSON file.

    Args:
        login_data_file (str): Path of the JSON file to read.
        verbose (bool): When True, print a confirmation line followed by the
            keys of the loaded object.

    Returns:
        The object decoded from the JSON file (``github_login`` reads the
        'username' and 'password' keys from it).

    Raises:
        OSError: If the file cannot be opened for reading.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    import json

    # Open read-only: the file is never written here, and requiring write
    # access ('r+') makes a read-only secrets file fail to load.
    with open(login_data_file, 'r', encoding='utf-8') as f:
        login_data = json.load(f)

    if verbose:
        print("Loaded json data. Keys:")
        print(login_data.keys())
    return login_data
def github_login(driver, login_data=None):
    """Fill in and submit the GitHub sign-in form currently shown in ``driver``.

    The driver must already be displaying the GitHub login form (presumably
    reached by opening 'https://instruction.learn.co/staff/students' --
    confirm against the caller).

    Args:
        driver: Selenium WebDriver (any object exposing
            ``find_element(by, value)``).
        login_data (dict, optional): Mapping with 'username' and 'password'
            entries. When None, it is obtained from ``load_login_data()``.

    Returns:
        None
    """
    if login_data is None:
        login_data = load_login_data()

    # Locator strategy name; this is the value of selenium's ``By.XPATH``.
    # ``find_element(by, value)`` is used because the ``find_element_by_*``
    # helpers were removed in Selenium 4.3, while this form works in both
    # Selenium 3 and 4.
    by_xpath = "xpath"

    username_box = driver.find_element(by_xpath, '//*[@id="login_field"]')
    username_box.send_keys(login_data['username'])

    password_box = driver.find_element(by_xpath, '//*[@id="password"]')
    password_box.send_keys(login_data['password'])

    sign_in_button = driver.find_element(by_xpath, '//*[@id="login"]/form/div[2]/input[8]')
    sign_in_button.click()
def instruct_menu_to_cohort_roster(driver, cohort="pt"):
    """Navigate the side menu of the instruct page to a cohort roster.

    Hovers the first navigation link, clicks the cohorts dropdown and then
    clicks the link of the requested cohort, with half-second pauses between
    the steps.

    Args:
        driver: Selenium WebDriver showing the page with the navigation menu.
        cohort (str): Which cohort link to click: "pt" or "ft".

    Returns:
        The result of ``ActionChains.perform()``.

    Raises:
        ValueError: If ``cohort`` is not "pt" or "ft". This is checked before
            touching the browser, instead of failing later on an unassigned
            local variable.
    """
    import time

    # XPath of the roster link for each supported cohort.
    cohort_link_xpaths = {
        "pt": '//*[@id="js-childrenList-141"]/ul/li[1]/a',
        "ft": '//*[@id="js-sidenavChildrenList-140"]/li[2]/a',
    }
    if cohort not in cohort_link_xpaths:
        raise ValueError(
            f"Unknown cohort {cohort!r}; expected one of {sorted(cohort_link_xpaths)}"
        )

    time.sleep(0.5)

    # Locator strategy name; this is the value of selenium's ``By.XPATH``.
    # ``find_element(by, value)`` works in Selenium 3 and 4, whereas the
    # ``find_element_by_*`` helpers were removed in Selenium 4.3.
    by_xpath = "xpath"

    cohort_lead = driver.find_element(by_xpath, '/html/body/div[1]/nav/div[1]/ul/li[1]/a')
    my_cohorts = driver.find_element(by_xpath, '//*[@id="js-parentDropdownLink"]')
    cohort_link = driver.find_element(by_xpath, cohort_link_xpaths[cohort])

    # Queue the hover / click sequence, then run it in one go.
    actions = ActionChains(driver)
    actions.move_to_element(cohort_lead)
    actions.pause(.5)
    actions.click(my_cohorts)
    actions.pause(.5)
    actions.click(cohort_link)
    return actions.perform()
def _flatten_roster_row(row):
    """Return the text of one table row as a single tab-separated line."""
    row_text = row.get_text(separator='\t', strip=True)
    # Replace the tab-prefixed "Links" cell text with a single space.
    row_text = row_text.replace("\tLinks", ' ')
    # Colons and embedded newlines are replaced by spaces.
    for old, new in ((':', ' '), ('\n', ' ')):
        row_text = row_text.replace(old, new)
    return row_text


def _realign_roster_frame(df):
    """Shift misaligned rows back into place and restore the column names."""
    # Move the index into a regular 'index' column and remember the names of
    # the remaining columns so they can be re-applied at the end.
    # NOTE(review): presumably the data rows carry one more field than the
    # header, so pandas used the first field as the index -- confirm.
    df.reset_index(inplace=True)
    cols = df.drop('index', axis=1).columns

    if df["Completed Lessons"].isna().any():
        shift_index = df.loc[df['Completed Lessons'].isna()].index

        # Copy values across columns for the affected rows. The copies run
        # in this exact order, so later entries read the result of earlier ones.
        cols_to_swap = {
            "Completed Lessons": "Last Checkin Note",
            "Instructor": "Checkins (NoShows)",
            "Checkins (NoShows)": "Last Checkin Note",
        }
        for bad_col, good_col in cols_to_swap.items():
            df.loc[shift_index, bad_col] = df.loc[shift_index, good_col].copy()
        df.loc[shift_index, "Last Checkin Note"] = np.nan

    # Drop one column so the column count matches the saved names again,
    # then restore those names.
    df.drop(columns=["Completed Lessons"], inplace=True)
    df.columns = cols
    return df


def cohort_driver_to_csv(driver, output_file='cohort_output.csv',
                         debug=False, load=False, load_kws=None):
    """Export the first table in ``driver.page_source`` to a tab-separated file.

    Args:
        driver (WebDriver): cohort instruct page's driver.
        output_file (str): name of the file to save (tab-delimited text,
            written as UTF-8).
        debug (bool): When True, print the number of tab-separated fields of
            every row; additionally, the first row whose text contains
            'John' is returned immediately (before anything is saved).
        load (bool): When True, read the saved file back with pandas,
            realign shifted rows and return the DataFrame.
        load_kws (dict, optional): Extra keyword arguments for
            ``pd.read_csv``.

    Returns:
        pandas.DataFrame when ``load`` is True, otherwise None (or a
        BeautifulSoup row tag in the debug case described above).

    Raises:
        ValueError: If the page source contains no ``<table>`` element.

    TO DO: Add link extraction
    """
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    table = soup.find("table")
    if table is None:
        raise ValueError("No <table> element found in driver.page_source")

    output_rows = []
    for row in table.find_all('tr'):
        row_text = _flatten_roster_row(row)
        if debug:
            # Count tab-separated fields (the separator used above).
            print(len(row_text.split('\t')))
            # NOTE(review): the original indentation was lost; this early
            # return is assumed to be a debug-only aid -- confirm.
            if 'John' in row_text:
                print("Returning John row object")
                return row
        output_rows.append(row_text)

    # Written as UTF-8, which is also pandas' default when reading it back.
    with open(output_file, 'w', encoding='utf-8') as csvfile:
        csvfile.write('\n'.join(output_rows))
    print(f"[i] Successfully saved '{output_file}'")

    if not load:
        return None

    df = pd.read_csv(output_file, delimiter='\t', **(load_kws or {}))
    return _realign_roster_frame(df)
def help():
    """Print the suggested scraping workflow, one call per line, to stdout."""
    workflow_lines = (
        "[i] Workflow:",
        "driver = start_driver()",
        'login_data=load_login_data()',
        "github_login(driver,login_data)",
        "instruct_menu_to_cohort_roster(driver,cohort='pt')",
        "df = cohort_driver_to_csv(driver,'pt_cohort_data.csv',load=True)",
    )
    for line in workflow_lines:
        print(line)
# Module-level call: the workflow reminder is printed as soon as this
# module is imported or run.
help()