Football Data Scraper

This was built as a tool to pull Fantasy and other stats from the websites below. The code is not perfect, optimized, or thoroughly annotated, but hopefully it is helpful! The extractions were used to build an interactive Qliksense application.

QS App Example

# Full Real Stats

from bs4 import BeautifulSoup
import pandas as pd
import requests
import numpy as np

# set url and year. maxp is # of players to load from table
url = 'https://www.pro-football-reference.com'
year = 2019
maxp = 300

# grab fantasy players. First block grabs urls from table
r = requests.get(url + '/years/' + str(year) + '/fantasy.htm')
soup = BeautifulSoup(r.content, 'html.parser')
parsed_table = soup.find_all('table')[0]

df = []
tdf = []

# first 2 rows are col headers
for i, row in enumerate(parsed_table.find_all('tr')[2:]):
    if i % 10 == 0:
        print(i, end=' \n')
    if i >= maxp:
        print('\nComplete.')
        break

    try:
        # stub is the piece of URL for each player
        dat = row.find('td', attrs={'data-stat': 'player'})
        name = dat.a.get_text()
        stub = dat.a.get('href')
        stub = stub[:-4] + '/fantasy/' + str(year)
        #print(url + stub)
        # find position of player
        pos = row.find('td', attrs={'data-stat': 'fantasy_pos'}).get_text()
        # read in html - our table is the first table in the webpage
        tdf = pd.read_html(url + stub)[0]
        # grab the correct column index for the columns we want
        tdf.columns = tdf.columns.get_level_values(-1)
        print(len(tdf.columns.tolist()))
        print(tdf.columns.tolist())
        #r_img = requests.get(url + stub)
        #soup_img = BeautifulSoup(r_img.content, 'html.parser')

        # setup for dynamic renaming of columns
        column_list = []
        # below grabs the html info from the table. We use this to get column names and also image url
        r2 = requests.get(url+stub)
        soup2 = BeautifulSoup(r2.content, 'html.parser')
        parsed_img = soup2.find_all('img')[1]
        # these pieces find the correct html elements we need - takes some sleuthing
        parsed_table2 = soup2.find_all('table')[0]
        table = parsed_table2.find_all('tr')[2]

        # the first 8 header cells are used as-is; the remaining cells get their over-header prefixed below
        for th in table.find_all('th')[0:8]:
            column_name = th.get_text()
            column_list.append(column_name)

        for th in table.find_all('th')[8:]:
            # print(th)
            if 'in_10' in th['data-stat']:
                top_head = 'Inside 10_'
            else:
                top_head = ''
            header = th['data-over-header']
            # print(len(header))
            colname = th.get_text()
            column_name = top_head+header+'_'+colname

            column_list.append(column_name)
            print(column_name)
        print(len(column_list))
        # print(column_list)

        # drop the "Total" row at the bottom of the table
        tdf = tdf.query('Date != "Total"')

        # rename the columns and add player name, position, season, and home/away flag
        tdf.columns = column_list
        tdf['Name'] = name
        tdf['Position'] = pos
        tdf['Season'] = year
        tdf = tdf.rename(columns={'': 'Away'})
        tdf['Away'] = [1 if r == '@' else 0 for r in tdf['Away']]
        tdf['img_URL'] = parsed_img.get('src')
        df.append(tdf)

    except Exception:
        # rows without a player link (repeated header rows) or failed page loads are skipped
        pass
df = pd.concat(df, sort=False)
df.to_csv('fantasy2019_1.csv')
# grab fantasy players again - this second pass pulls each player's game log page instead of the fantasy page
r = requests.get(url + '/years/' + str(year) + '/fantasy.htm')
soup = BeautifulSoup(r.content, 'html.parser')
parsed_table = soup.find_all('table')[0]

df = []
tdf = []

# first 2 rows are col headers
for i, row in enumerate(parsed_table.find_all('tr')[2:]):
    if i % 10 == 0:
        print(i, end=' \n')
    if i >= maxp:
        print('\nComplete.')
        break

    try:
        dat = row.find('td', attrs={'data-stat': 'player'})
        name = dat.a.get_text()
        stub = dat.a.get('href')
        stub = stub[:-4] + '/gamelog/' + str(year)
        #print(url + stub)
        pos = row.find('td', attrs={'data-stat': 'fantasy_pos'}).get_text()
        tdf = pd.read_html(url + stub)[0]
        tdf.columns = tdf.columns.get_level_values(-1)
        # print(len(tdf.columns.tolist()))
        # print(tdf.columns.tolist())
        #r_img = requests.get(url + stub)
        #soup_img = BeautifulSoup(r_img.content, 'html.parser')

        # setup for dynamic renaming of columns
        column_list = []
        r2 = requests.get(url+stub)
        soup2 = BeautifulSoup(r2.content, 'html.parser')
        parsed_img = soup2.find_all('img')[1]
        parsed_table2 = soup2.find_all('table')[0]
        table = parsed_table2.find_all('tr')[1]

        # the first 10 header cells are used as-is; the remaining cells get their over-header prefixed below
        for th in table.find_all('th')[0:10]:
            column_name = th.get_text()
            column_list.append(column_name)

        for th in table.find_all('th')[10:]:
            header = th['data-over-header']
            colname = th.get_text()
            # unlike the fantasy table above, the game log needs no "Inside 10" prefix
            column_name = header + '_' + colname
            column_list.append(column_name)

        # drop the "Total" row at the bottom of the game log
        tdf = tdf.query('Date != "Total"')

        # rename the home/away indicator column before dropping the unnamed columns
        tdf = tdf.rename(columns={'Unnamed: 6_level_1': 'Away'})
        tdf = tdf.loc[:, ~tdf.columns.str.contains('Unnamed', case=False)]
        tdf = tdf[~tdf.Date.str.contains("Games")]
        # print(tdf.columns)
        tdf.columns = column_list
        tdf = tdf.rename(columns={'': 'Away'})
        tdf['Name'] = name
        tdf['Position'] = pos
        tdf['Season'] = year

        tdf['Away'] = [1 if r == '@' else 0 for r in tdf['Away']]
#         #print(tdf.columns)
        tdf['img_URL'] = parsed_img.get('src')
        tdf = tdf.loc[:, ~tdf.columns.duplicated()]
        # print(tdf.columns)

        df.append(tdf)

    except:
        pass
df = pd.concat(df, sort=False)
df.to_csv('stats2019_1.csv')
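
The two exports above are what feed the Qliksense app. Below is a minimal read-back check, assuming the CSVs were written to the working directory with the filenames used above; the column list is just illustrative and not part of the original pipeline.

import pandas as pd

# sanity-check the exported files (filenames assumed from the to_csv calls above)
fantasy = pd.read_csv('fantasy2019_1.csv', index_col=0)
gamelog = pd.read_csv('stats2019_1.csv', index_col=0)

for label, frame in [('fantasy', fantasy), ('gamelog', gamelog)]:
    # every row should carry the metadata columns added during the scrape
    print(label, frame.shape)
    print(frame[['Name', 'Position', 'Season', 'Away']].head())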

The code below was not used - it extracts data via the YouTube API. I found a simpler method, shown in the code snippets further below.

from apiclient.discovery import build
from apiclient.errors import HttpError
from oauth2client.tools import argparser

DEVELOPER_KEY = "YOUR_DEVELOPER_KEY"  # replace with your own YouTube Data API key
YOUTUBE_API_SERVICE_NAME = "youtube"
YOUTUBE_API_VERSION = "v3"


def youtube_search(q, max_results=50, order="relevance", token=None, location=None, location_radius=None):

    youtube = build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION,
                    developerKey=DEVELOPER_KEY)

    search_response = youtube.search().list(
        q=q,
        type="video",
        pageToken=token,
        order=order,
        part="id,snippet",
        maxResults=max_results,
        location=location,
        locationRadius=location_radius

    ).execute()

    videos = []

    for search_result in search_response.get("items", []):
        if search_result["id"]["kind"] == "youtube#video":
            videos.append(search_result)
    try:
        nexttok = search_response["nextPageToken"]
        return(nexttok, videos)
    except Exception as e:
        nexttok = "last_page"
        return(nexttok, videos)


def geo_query(video_id):
    youtube = build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION,
                    developerKey=DEVELOPER_KEY)

    video_response = youtube.videos().list(
        id=video_id,
        part='snippet,recordingDetails,statistics'

    ).execute()

    return video_response
import sys

#from youtube_videos import youtube_search
import pandas as pd
import json


vid_title = []
vid_id = []

# unique_players is assumed to be a DataFrame whose first column holds the scraped player names
for name in unique_players[0]:

    test = youtube_search(name)
    just_json = test[1]

    video = just_json[0]
    #     print (video['snippet']['title'])
    #     print (video['id']['videoId'])
    player_name = name
    vid_title.append(video['snippet']['title'])
    vid_id.append(video['id']['videoId'])

# token = test[0]
# youtube_search('nfl', token=token)
#res = youtube_search(keyword, token=token)


# def grab_videos(keyword, token=None):
#     res = youtube_search(keyword, token=token)
#     token = res[0]
#     videos = res[1]
#     for vid in range(0,2):
#         video_dict['youID'].append(vid['id']['videoId'])
#         video_dict['title'].append(vid['snippet']['title'])
#         video_dict['pub_date'].append(vid['snippet']['publishedAt'])
#     print ("added " + str(len(videos)) + " videos to a total of " + str(len(video_dict['youID'])))
#     return token


# token = grab_videos("nfl")
# while token != "last_page":
#     token = grab_videos("nfl", token=token)

This set of code first references the code above to simply pull player names. I then pull values out of YouTube searches for image URLs.

# Full Real Stats

from bs4 import BeautifulSoup
import pandas as pd
import requests
import numpy as np

# set url and year. maxp is # of players to load from table
url = 'https://www.pro-football-reference.com'
year = 2019
maxp = 300

# grab fantasy players. First block grabs urls from table
r = requests.get(url + '/years/' + str(year) + '/fantasy.htm')
soup = BeautifulSoup(r.content, 'html.parser')
parsed_table = soup.find_all('table')[0]

df = []
tdf2 = []

# first 2 rows are col headers
for i, row in enumerate(parsed_table.find_all('tr')[2:]):
    if i % 10 == 0:
        print(i, end=' \n')
    if i >= maxp:
        print('\nComplete.')
        break

    try:
        # stub is the piece of URL for each player
        dat = row.find('td', attrs={'data-stat': 'player'})
        name = dat.a.get_text()
        stub = dat.a.get('href')
        stub = stub[:-4] + '/fantasy/' + str(year)
        tdf2.append(name)

    except:
        pass
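
The unused YouTube API snippet above loops over unique_players[0]; I read that as a one-column DataFrame built from this name list. A minimal sketch of that step, assuming only the tdf2 list collected above (the deduplication is my addition):

# build the unique_players frame the earlier snippet expects (assumed: one unnamed column of names)
unique_players = pd.DataFrame(tdf2)
unique_players = unique_players.drop_duplicates().reset_index(drop=True)
print(unique_players[0].head())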

This code pulls out the team logos. I slightly updated the player code to pull out the image URLs. There is probably a better way - it was partly broken, so I manually added a few to my file (I got lazy!).

from bs4 import BeautifulSoup
import pandas as pd
import requests
import numpy as np
# grab URL for logos. First block grabs urls from table
r = requests.get('https://teamcolorcodes.com/nfl-team-color-codes')
soup = BeautifulSoup(r.content, 'html.parser')
parsed_table = soup.find_all('p')[2]
maxp = 40
tm_name = []
tm_url = []
img_url = []

for i, row in enumerate(parsed_table.find_all('a')):
    if i % 10 == 0:
        print(i, end=' \n')
    if i >= maxp:
        print('\nComplete.')
        break

    try:

        name = row.get_text()
        stub = row.get('href')
        # print(stub)
        r2 = requests.get(stub)
        soup2 = BeautifulSoup(r2.content, 'html.parser')
        parsed_img = soup2.find_all('p')[5]
        url = parsed_img.a.get('href')
        # print(url)
        tm_name.append(name)
        tm_url.append(stub)
        img_url.append(url)
    except:
        pass
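
The loop above only fills three parallel lists (tm_name, tm_url, img_url). To use them in the app they still have to be combined and saved; a minimal sketch, assuming the same CSV export pattern as the player scrapes (the teams2019.csv filename is illustrative):

# combine the scraped team info into one frame and export it (filename is an assumption)
teams = pd.DataFrame({'Team': tm_name, 'Team_URL': tm_url, 'Logo_URL': img_url})
teams.to_csv('teams2019.csv', index=False)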

The logic below pulls YouTube search results and then extracts the video links, which I use in the app.

import urllib.request
import urllib.parse
from bs4 import BeautifulSoup


vid_id = []
# vid_id5=[]
for i, name in enumerate(tdf2):
    print(name, ' ', i)
    textToSearch = name + ' 2019 Highlights +Sports Productions'
    query = urllib.parse.quote(textToSearch)
    url = "https://www.youtube.com/results?search_query=" + query
    response = urllib.request.urlopen(url)
    html = response.read()
    soup = BeautifulSoup(html, 'html.parser')
    # note: this parsing relies on YouTube's older server-rendered results markup
    id_url = soup.findAll(attrs={'class': 'yt-uix-tile-link'})[1]
    url_new = id_url['href']
    vid_id.append(url_new)

Here we simply match each player name to a video link.

vid_id2 = []
for i in vid_id:
    # strip the leading '/watch?v=' (9 characters) to keep just the video id
    vid_id2.append(i[9:])
#vid_id2 = list(dict.fromkeys(vid_id2))
unique_name = pd.DataFrame()
unique_name['players'] = tdf2
unique_name['id'] = pd.Series(vid_id2)
unique_tm = df.Tm.unique()
unique_team = pd.DataFrame()
unique_team['name'] = unique_tm
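
As with the stat pulls, these lookup tables need to be written out for the Qliksense app. A minimal sketch, assuming the same CSV export pattern used above (both filenames are illustrative):

# export the player-to-video and team lookup tables (filenames are assumptions)
unique_name.to_csv('player_videos2019.csv', index=False)
unique_team.to_csv('team_names2019.csv', index=False)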
