import logging

import pandas as pd
import requests
from bs4 import BeautifulSoup
_LOGGER = logging.getLogger(__name__)

# Team-stats page for one gol.gg team code (Summer split, all tournaments).
_TEAM_STATS_URL = 'https://gol.gg/teams/team-stats/{}/split-Summer/tournament-ALL/'
# CSS class of the page blocks that contain the statistics.
_STATS_BLOCK_CLASS = 'col-12 col-sm-6 rowbreak pb-4'
# Without a timeout a stalled connection would block the whole scrape.
_REQUEST_TIMEOUT_SECONDS = 30
# Blocks 0 and 3-6 are read, so a usable page has at least seven of them.
_MIN_SECTIONS = 7

# One entry per scraped column, in output order:
# (column name, section index, position inside the section, label text to strip).
# The page text runs a value together with the label of the *next* statistic,
# which is why each entry strips a label that belongs to the following field.
# NOTE(review): the label literals are carried over verbatim from the previous
# implementation; confirm their exact whitespace against the live page.
_FIELD_SPECS = (
    ('Region', 0, 1, None),
    ('Season', 0, 3, None),
    ('Winrate', 0, 7, ' %Average game duration : '),  # still text, not a number
    ('Avg_game_dur', 0, 8, None),  # still text, not seconds
    ('GPM', 3, 2, 'Gold Differential per Minute'),
    ('GDiff_per_min', 3, 3, 'Gold Differential at 15 min'),
    ('GDiff_at_15', 3, 4, 'CS Per Minute'),
    ('CS_per_min', 3, 5, 'CS Differential at 15 min'),
    ('CSDiff_at_15', 3, 6, 'Tower Differential at 15 min'),
    ('Tower_diff_at_15', 3, 7, 'Tower Ratio'),
    ('Tower_ratio', 3, 8, 'First Tower '),
    ('First_tower', 3, 11, '%'),  # still text, not a number
    ('DMG_per_min', 4, 2, 'First Blood'),
    ('First_blood', 4, 5, '%'),  # still text, not a number
    ('Kills_per_game', 4, 7, 'Deaths Per Game'),
    ('Death_per_game', 4, 8, 'Kill / Death Ratio'),
    ('Kill_per_death_ratio', 4, 9, 'Average Assists / Kill'),
    ('Avg_assists_per_kills', 4, 10, None),
    ('Dragons_per_game', 5, 2, ' '),
    ('Dragons_at_15', 5, 4, 'Herald / game'),
    ('Nashors_per_game', 5, 7, ' '),
    ('Wards_per_min', 6, 2, 'Vision Wards Per Minute'),
    ('Vision_wards_per_min', 6, 3, 'Wards Cleared Per Minute'),
    ('Wards_cleared_per_minute', 6, 4, '% Wards Cleared'),
    ('Wards_cleared', 6, 7, ' %'),  # still text, not a number
)


def _split_flat(pieces, delimiter):
    """Split every string in *pieces* on *delimiter* and return one flat list."""
    return [part for piece in pieces for part in piece.split(delimiter)]


def _fetch_sections(gol_code):
    """Download one team page and return the text lines of each stats block."""
    page = requests.get(
        _TEAM_STATS_URL.format(gol_code), timeout=_REQUEST_TIMEOUT_SECONDS
    )
    soup = BeautifulSoup(page.content, 'html.parser')
    # Query the document once; each block becomes a list of its text lines.
    return [
        block.get_text().replace('\n\n', '').split('\n')
        for block in soup.find_all(class_=_STATS_BLOCK_CLASS)
    ]


def _parse_fields(sections):
    """Return ``{column: raw text}`` for the fields present in *sections*.

    *sections* must hold at least ``_MIN_SECTIONS`` lists of strings. Parsing
    stops at the first field whose position does not exist, so the result may
    cover only a leading part of ``_FIELD_SPECS``.
    """
    sections = list(sections)  # do not mutate the caller's list
    # Sections 3-6 hold "label: value" text; section 5 additionally nests
    # values in parentheses, so it is split a second time on '('.
    for index in (3, 4, 5, 6):
        sections[index] = _split_flat(sections[index], ':')
    sections[5] = _split_flat(sections[5], '(')

    fields = {}
    for column, section, position, label in _FIELD_SPECS:
        try:
            text = sections[section][position]
        except IndexError:
            break
        fields[column] = text if label is None else text.replace(label, '')
    return fields


def gol_scraper(url_df, gol_code_colname):
    """Scrape gol.gg team statistics for every row of *url_df*.

    For each row the team page
    ``https://gol.gg/teams/team-stats/<code>/split-Summer/tournament-ALL/``
    is downloaded, where ``<code>`` is the row's value in *gol_code_colname*,
    and the statistics listed in ``_FIELD_SPECS`` are extracted as raw text.

    Args:
        url_df: DataFrame with a ``'Team'`` column and a column of gol.gg
            team codes.
        gol_code_colname: Name of the column in *url_df* holding the codes.

    Returns:
        A DataFrame with the column ``'Team'`` followed by one column per
        ``_FIELD_SPECS`` entry. Rows carry the index labels of *url_df*.
        All scraped values are unconverted strings. Teams whose request
        fails or whose page has fewer than seven stats blocks get no row;
        a page that is missing later fields yields a partially filled row.

    Raises:
        KeyError: If *url_df* has rows but lacks ``'Team'`` or
            *gol_code_colname*.
    """
    list_of_vars = ['Team'] + [spec[0] for spec in _FIELD_SPECS]
    df = pd.DataFrame(columns=list_of_vars)

    # Iterate the real index labels: positional counters only match ``.loc``
    # when the frame happens to have a default 0..n-1 index.
    for x in url_df.index:
        team = url_df.loc[x, 'Team']
        gol_code = url_df.loc[x, gol_code_colname]

        try:
            sections = _fetch_sections(gol_code)
        except requests.RequestException as err:
            # Best effort: one unreachable page must not abort the scrape.
            _LOGGER.warning('Skipping %s: request failed (%s)', team, err)
            continue

        if len(sections) < _MIN_SECTIONS:
            _LOGGER.warning(
                'Skipping %s: page has %d stats blocks, expected at least %d',
                team, len(sections), _MIN_SECTIONS,
            )
            continue

        fields = _parse_fields(sections)
        if len(fields) < len(_FIELD_SPECS):
            _LOGGER.warning(
                'Incomplete statistics for %s: parsed %d of %d fields',
                team, len(fields), len(_FIELD_SPECS),
            )

        df.loc[x, 'Team'] = team
        for column, value in fields.items():
            df.loc[x, column] = value

    return df