Scraping NHL Individual stats

With the goal of gathering the biggest individual hockey stats dataset ever, I've started with the big league: the NHL.



Using Selenium, I've been able to scrape the NHL's statistics page.



To explain the workflow simply: I extract the data from the standings table, which comes back as one big string with "\n" separators to split the data on. Then I "click" the Next button and repeat until the whole list has been traversed. Afterwards, I change the NHL season via the URL and redo the steps above.



Everything works fine, but I have some concerns about readability/maintainability.



from selenium import webdriver
from selenium.webdriver.support.ui import Select
import math
import csv
import os


csv_header = ["#", "Player", "Season", "Team", "Pos", "GP", "G", "A", "P", "+/-", "PIM", "P/GP", "PPG",
              "PPP", "SHG", "SHP", "GWG", "OTG",
              "S", "S%", "TOI/GP", "Shifts/GP", "FOW%"]


def scrape_nhl_standings(csv_dump_path, start_year, end_year):
    try:
        driver = webdriver.Chrome()

        for year in range(start_year, end_year + 1):
            driver.get(build_url(year))

            # There is a weird bug on the NHL website where the request for the
            # 07-08 season also returns 06-07, so we set it back.
            if year == 2007:
                year_range = Select(driver.find_element_by_class_name("filter__range")
                                          .find_element_by_tag_name("select"))
                year_range.select_by_value("20072008")
                driver.find_element_by_class_name("go").click()

                # Force Selenium to wait for the page to reload.
                driver.implicitly_wait(5)

            # We save about 2 seconds by setting the page size to 100.
            set_page_size_to_100(driver)

            players_standings = []
            next_page_button = driver.find_element_by_class_name("-next").find_element_by_tag_name("button")

            while next_page_button.get_attribute("disabled") is None:
                table_standings_page = driver.find_element_by_class_name("rt-tbody")
                players_standings += parse_standings_page(table_standings_page.text)
                next_page_button.click()

            write_to_csv(csv_dump_path, players_standings, year)
            print("Finished season {0}-{1}".format(year, year + 1))

    finally:
        # noinspection PyUnboundLocalVariable
        driver.close()


def build_url(seasons_start_year):
    year_string = str(seasons_start_year) + str(seasons_start_year + 1)

    return ("http://www.nhl.com/stats/player?reportType=season"
            "&seasonFrom={0}"
            "&seasonTo={0}"
            "&gameType=2"
            "&filter=gamesPlayed,gte,1"
            "&sort=points,goals,assists").format(year_string)


def set_page_size_to_100(driver):
    page_size_dropdown = Select(driver
                                .find_element_by_class_name("-pageSizeOptions")
                                .find_element_by_tag_name("select"))

    page_size_dropdown.select_by_value("100")


def parse_standings_page(standings):
    players_standings = []
    cells_per_row = 23
    cells = standings.split('\n')

    rows_count = len(cells) // cells_per_row

    if not rows_count - math.floor(rows_count) == 0:
        raise ValueError("Cells count isn't divisible by cells per row.")

    for i in range(0, int(rows_count)):

        row = cells[i * 23: (i + 1) * 23]

        row[0] = int(row[0])                # standing
        row[5] = int(row[5])                # Games Played
        row[6] = int(row[6])                # Goals
        row[7] = int(row[7])                # Assists
        row[8] = int(row[8])                # Points
        row[9] = int(row[9])                # Plus/Minus
        row[10] = int(row[10])              # PIM
        row[11] = try_parse_float(row[11])  # P/GP
        row[12] = int(row[12])              # PPG
        row[13] = int(row[13])              # PPP
        row[14] = int(row[14])              # SHG
        row[15] = int(row[15])              # SHP
        row[16] = int(row[16])              # GWG
        row[17] = int(row[17])              # OTG
        row[18] = int(row[18])              # Shots
        row[19] = try_parse_float(row[19])  # Shot %
        row[21] = try_parse_float(row[21])  # Shifts/GP
        row[22] = try_parse_float(row[22])  # FOW%

        players_standings.append(row)

    return players_standings


def try_parse_float(x):
    return float(x) if not x == "--" else 0


def write_to_csv(csv_dump_path, players_standings, year):

    with open(csv_dump_path + "{0}-{1} NHL Standings.csv".format(year, year + 1), "w+") as csvfile:
        csvwriter = csv.writer(csvfile, delimiter=",")
        csvwriter.writerow(csv_header)

        for row in players_standings:
            csvwriter.writerow(row)


if __name__ == "__main__":

    csv_path = os.path.dirname(__file__) + "/../data/"

    if not os.path.exists(csv_path):
        os.makedirs(csv_path)

    scrape_nhl_standings(csv_path, start_year=2007, end_year=2007)


My main concerns are :



  • The parse_standings_page function. For some cells I need to parse the value to int or float. Sometimes, if a player didn't record a single point, the Points Per Game Played (P/GP) stat will contain "--". This makes for really ugly code, but I can't seem to find a simple way to parse all these values. Also, indexing into each cell hurts readability and makes it easy to mess up; I'd like a more explicit approach.

  • General naming conventions. The main goal of this script is to call scrape_nhl_standings; should the other methods start with _ to indicate they are "private"?

Of course I'm 100% open to anything else that would improve my code.







edited Apr 27 at 15:48 by Peilonrayz

asked Apr 27 at 15:32 by IEatBagels
2 Answers

















score 6, accepted (+50 bounty)
          I will make a few points about the code as code, but the first thing to consider in any sort of data collection is respecting the rights of the owners of the data.



In particular, the terms of service on the NHL site state:




          You may not access or use, or attempt to access or use, the Services to take any action that could harm us or any other person or entity (each a "person"), interfere with the operation of the Services, or use the Services in a manner that violates any laws. For example, you may not: ... Engage in unauthorized spidering, scraping, or harvesting of content or information, or use any other unauthorized automated means to compile information;




Therefore, consider that using this script for the purpose for which it was written is probably illegal. If you actually set yourself up in competition to the site using data gained in this manner, you would almost guarantee yourself a lawsuit to defend.




rows_count = len(cells) // cells_per_row

if not rows_count - math.floor(rows_count) == 0:
    raise ValueError("Cells count isn't divisible by cells per row.")


          The // here is expressly integer division, and already does the flooring. As such the error you're trying to detect cannot be raised. It could instead be written



if rows_count * cells_per_row != len(cells):
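The same guard can also be written with divmod, which makes the "no remainder" intent explicit (a sketch with hypothetical sample data, not code from the question or answer):

```python
# Hypothetical flattened table: exactly two rows of 23 cells each.
cells_per_row = 23
cells = ["x"] * 46

# divmod returns the integer quotient and the remainder in one call.
rows_count, remainder = divmod(len(cells), cells_per_row)
if remainder != 0:
    raise ValueError("Cells count isn't divisible by cells per row.")
```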



def try_parse_float(x):
    return float(x) if not x == "--" else 0


I don't think that's actually that ugly. However, even in Python you should pay attention to types, especially in a function that is meant to change the type! As written, the "--" branch returns the int 0 rather than the float 0.0.
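To make the return-type point concrete:

```python
# The question's function, reproduced verbatim: the "--" branch yields
# the int 0, so the return type depends on the input.
def try_parse_float(x):
    return float(x) if not x == "--" else 0

assert isinstance(try_parse_float("--"), int)    # int 0, not float
assert isinstance(try_parse_float("1.5"), float)
```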




for row in players_standings:
    csvwriter.writerow(row)


          Python is pretty good at providing tools that operate on collections directly, so it's worth checking the docs before writing loops. csvwriter.writerows() might be the function you're after here.



          In general writing to disk in larger chunks is preferable to writing in smaller chunks, especially on spinning hard disks where the seek time is considerable.
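For instance, with an in-memory buffer (a sketch with made-up sample rows, not the question's file handling):

```python
import csv
import io

buffer = io.StringIO()
writer = csv.writer(buffer, delimiter=",")
writer.writerow(["GP", "G"])            # single header row
writer.writerows([[82, 65], [82, 52]])  # all data rows in one call
```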





          General naming conventions. The main goal of this script is to call
          scrape_nhl_standings, should the other methods start with _ to
          indicate they are "private"?




The concept of private methods is generally one from object-oriented programming. While Python supports that, and you're right that underscores are the recommended way to indicate privacy in that context, that is not what you are doing here. That's not to say it wouldn't work; it just seems slightly out of place in this coding style. In any case, for matters of style, check PEP 8.




          You could potentially shorten parse_standings_page by having a list of the parsing functions applicable to each index, and looping through the indices. You could also potentially avoid looping i by reshaping cells (into a 23 by rows_count 2D array) and then mapping a row parse function to it. However, I actually think what you have is clearer.
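The first idea might look something like this (a sketch with hypothetical helper names, assuming the column order matches the question's csv_header):

```python
def parse_float(x):
    # "--" marks a missing value on the stats page.
    return float(x) if x != "--" else 0.0

def keep(x):
    # Identity: leave text columns (Player, Team, TOI/GP, ...) untouched.
    return x

# One parser per column, in csv_header order.
parsers = [int, keep, keep, keep, keep,      # #, Player, Season, Team, Pos
           int, int, int, int, int, int,     # GP, G, A, P, +/-, PIM
           parse_float, int, int, int, int,  # P/GP, PPG, PPP, SHG, SHP
           int, int, int, parse_float,       # GWG, OTG, S, S%
           keep, parse_float, parse_float]   # TOI/GP, Shifts/GP, FOW%

def parse_row(cells):
    return [parse(cell) for parse, cell in zip(parsers, cells)]
```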






  • Very good catch about the terms of service. I didn't think about this at all when I started my project. Though it'll only be used by me, it's still something that I'll need to keep in mind for the future. – IEatBagels, May 2 at 13:21

















score 5













A few more comments to add after @Josiah's good answer:




  • You can simplify the driver management in scrape_nhl_standings by using a context manager. It seems like Selenium doesn't support this natively, but you can make use of contextlib.closing:



    from contextlib import closing

    def scrape_nhl_standings(csv_dump_path, start_year, end_year):
        with closing(webdriver.Chrome()) as driver:
            for year in range(start_year, end_year + 1):
                driver.get(build_url(year))
                ...



          • You could change try_parse_float to:



    def try_parse_float(x):
        try:
            return float(x)
        except ValueError:
            return 0.0


            so it is more robust, and possibly a tiny bit faster if most of the values are actually floats.




          • Did you read the note on os.makedirs?




            Note: makedirs() will become confused if the path elements to create include pardir (eg. “..” on UNIX systems).




            Instead, I recommend using dirname again:



            file_folder = os.path.abspath(os.path.dirname(__file__))
            csv_path = os.path.join(os.path.dirname(file_folder), 'data')
            os.makedirs(csv_path, exist_ok=True)






          share|improve this answer





















            Your Answer




            StackExchange.ifUsing("editor", function ()
            return StackExchange.using("mathjaxEditing", function ()
            StackExchange.MarkdownEditor.creationCallbacks.add(function (editor, postfix)
            StackExchange.mathjaxEditing.prepareWmdForMathJax(editor, postfix, [["\$", "\$"]]);
            );
            );
            , "mathjax-editing");

            StackExchange.ifUsing("editor", function ()
            StackExchange.using("externalEditor", function ()
            StackExchange.using("snippets", function ()
            StackExchange.snippets.init();
            );
            );
            , "code-snippets");

            StackExchange.ready(function()
            var channelOptions =
            tags: "".split(" "),
            id: "196"
            ;
            initTagRenderer("".split(" "), "".split(" "), channelOptions);

            StackExchange.using("externalEditor", function()
            // Have to fire editor after snippets, if snippets enabled
            if (StackExchange.settings.snippets.snippetsEnabled)
            StackExchange.using("snippets", function()
            createEditor();
            );

            else
            createEditor();

            );

            function createEditor()
            StackExchange.prepareEditor(
            heartbeatType: 'answer',
            convertImagesToLinks: false,
            noModals: false,
            showLowRepImageUploadWarning: true,
            reputationToPostImages: null,
            bindNavPrevention: true,
            postfix: "",
            onDemand: true,
            discardSelector: ".discard-answer"
            ,immediatelyShowMarkdownHelp:true
            );



            );








             

            draft saved


            draft discarded


















            StackExchange.ready(
            function ()
            StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f193087%2fscraping-nhl-individual-stats%23new-answer', 'question_page');

            );

            Post as a guest






























            2 Answers
            2






            active

            oldest

            votes








            2 Answers
            2






            active

            oldest

            votes









            active

            oldest

            votes






            active

            oldest

            votes








            up vote
            6
            down vote



            accepted
            +50










            I will make a few points about the code as code, but the first thing to consider in any sort of data collection is respecting the rights of the owners of the data.



            In particular the terms of service on the NHL site state




            You may not access or use, or attempt to access or use, the Services to take any action that could harm us or any other person or entity (each a "person"), interfere with the operation of the Services, or use the Services in a manner that violates any laws. For example, you may not: ... Engage in unauthorized spidering, scraping, or harvesting of content or information, or use any other unauthorized automated means to compile information;




            Therefore consider that using this script for the purpose it is written is probably illegal. If you actually set yourself up in competition to the site using data gained in this manner, you would almost guarantee yourself a lawsuit to defend.




            # There's a problem with markup here, below // isn't a comment
            rows_count = len(cells) // cells_per_row

            if not rows_count - math.floor(rows_count) == 0:
            raise ValueError("Cells count isn't divisible by cells per row.")


            The // here is expressly integer division, and already does the flooring. As such the error you're trying to detect cannot be raised. It could instead be written



            if rows_count * cells_per_row is not len(cells):



            def try_parse_float(x):
            return float(x) if not x == "--" else 0


            I don't think that's actually that ugly. However, even in Python you should pay attention to types, especially in a function that is meant to change the type! That will return an int 0 if not told otherwise.




            for row in players_standings:
            csvwriter.writerow(row)


            Python is pretty good at providing tools that operate on collections directly, so it's worth checking the docs before writing loops. csvwriter.writerows() might be the function you're after here.



            In general writing to disk in larger chunks is preferable to writing in smaller chunks, especially on spinning hard disks where the seek time is considerable.





            General naming conventions. The main goal of this script is to call
            scrape_nhl_standings, should the other methods start with _ to
            indicate they are "private"?




            The concept of private methods is generally one from object-oriented programming. While Python supports that, and you're right that underscores are the recommended way to indicate privacy in that context, that is not what you are doing here. That's not to say it wouldn't work; it just seems slightly out of place in this coding style. In any case, for matters of style, check PEP 8.




            You could potentially shorten parse_standings_page by having a list of the parsing functions applicable to each index and looping through the indices. You could also avoid looping over i by reshaping cells into a 23-by-rows_count 2D array and then mapping a row-parse function over it. However, I actually think what you have is clearer.
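For reference, the table-of-parsers idea might look like this sketch; the column layout here is assumed and shortened, not the real 23-column one:

```python
# Sketch of index-driven parsing with an assumed, shortened column layout:
# one parser per column, applied positionally with zip.
parsers = [int, str, str, int, float]
cells = ["1", "Player A", "TOR", "82", "12.5"]

row = [parse(cell) for parse, cell in zip(parsers, cells)]
print(row)  # [1, 'Player A', 'TOR', 82, 12.5]
```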






              Very good catch about the terms of service. I didn't think about this at all when I started my project. Though it'll only be used by me, it's still something that I'll need to keep in mind for the future.
              – IEatBagels
              May 2 at 13:21














            answered May 2 at 7:52 by Josiah

            A few more comments to add after @Josiah's good answer:




            • You can simplify the driver management in scrape_nhl_standings by using a context manager. It seems like selenium doesn't support it natively but you can make use of contextlib.closing:



              from contextlib import closing

              def scrape_nhl_standings(csv_dump_path, start_year, end_year):
                  with closing(webdriver.Chrome()) as driver:
                      for year in range(start_year, end_year + 1):
                          driver.get(build_url(year))
                          ...



            • You could change try_parse_float to:



              def try_parse_float(x):
                  try:
                      return float(x)
                  except ValueError:
                      return 0.0


              so it is more robust, and possibly a tiny bit faster if most of the values are actually floats.




            • Did you read the note on os.makedirs?




              Note: makedirs() will become confused if the path elements to create include pardir (eg. “..” on UNIX systems).




              Instead, I recommend using dirname again:



              file_folder = os.path.abspath(os.path.dirname(__file__))
              csv_path = os.path.join(os.path.dirname(file_folder), 'data')
              os.makedirs(csv_path, exist_ok=True)
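To illustrate the contextlib.closing point without launching a real browser, here is a self-contained sketch using a stand-in object; closing works with anything that exposes a close() method:

```python
from contextlib import closing

class FakeDriver:
    """Stand-in for webdriver.Chrome(): anything with a close() method."""
    def __init__(self):
        self.closed = False

    def close(self):
        self.closed = True

driver = FakeDriver()
with closing(driver):
    pass  # the scraping work would happen here

# close() was called on exit, even if the body had raised.
print(driver.closed)  # True
```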






                answered May 2 at 8:27 by Mathias Ettinger






















                     
