Scraping NHL Individual stats
Clash Royale CLAN TAG#URR8PPP
.everyoneloves__top-leaderboard:empty,.everyoneloves__mid-leaderboard:empty margin-bottom:0;
up vote
6
down vote
favorite
With the idea of gathering the biggest hockey's individual stats dataset ever, I've started with the big league, the NHL.
Using Selenium, I've been able to scrape this NHL's statistics page.
To explain the workflow simply, I extract the data from the standing's table. It's basically a big string with a lots of "n"
to split the data. Then I "click" on the next button, redo, until the whole list is passed. Afterwards I change the NHL season with the URL and redo the steps above.
Everything works fine, but I have some concerns about readability/maintainability.
from selenium import webdriver
from selenium.webdriver.support.ui import Select
import math
import csv
import os
csv_header = ["#", "Player", "Season", "Team", "Pos", "GP", "G", "A", "P", "+/-", "PIM", "P/GP", "PPG",
"PPP", "SHG", "SHP", "GWG", "OTG",
"S", "S%", "TOI/GP", "Shifts/GP", "FOW%"]
def scrape_nhl_standings(csv_dump_path,start_year, end_year):
try:
driver = webdriver.Chrome()
for year in range(start_year, end_year + 1):
driver.get(build_url(year))
# There is a weird bug on the NHL website where the request for 07-08 season also returns the 06-07 so we set it back.
if year == 2007:
year_range = Select(driver.find_element_by_class_name("filter__range").find_element_by_tag_name("select"))
year_range.select_by_value("20072008")
driver.find_element_by_class_name("go").click()
# Force Selenium to wait for the page to reload
driver.implicitly_wait(5)
# We save about 2 seconds by setting the page size to 100.
set_page_size_to_100(driver)
players_standings =
next_page_button = driver.find_element_by_class_name("-next").find_element_by_tag_name("button")
while next_page_button.get_attribute("disabled") is None:
table_standings_page = driver.find_element_by_class_name("rt-tbody")
players_standings += parse_standings_page(table_standings_page.text)
next_page_button.click()
write_to_csv(csv_dump_path, players_standings, year)
print("Finished season 0-1".format(year, year+1))
finally:
# noinspection PyUnboundLocalVariable
driver.close()
def build_url(seasons_start_year):
year_string = str(seasons_start_year) + str(seasons_start_year+1)
return "http://www.nhl.com/stats/player?reportType=season"
"&seasonFrom=0"
"&seasonTo=0"
"&gameType=2"
"&filter=gamesPlayed,gte,1"
"&sort=points,goals,assists".format(year_string)
def set_page_size_to_100(driver):
page_size_dropdown = Select(driver
.find_element_by_class_name("-pageSizeOptions")
.find_element_by_tag_name("select"))
page_size_dropdown.select_by_value("100")
def parse_standings_page(standings):
players_standings =
cells_per_row = 23
cells = standings.split('n')
# There's a problem with markup here, below // isn't a comment
rows_count = len(cells) // cells_per_row
if not rows_count - math.floor(rows_count) == 0:
raise ValueError("Cells count isn't divisible by cells per row.")
for i in range(0,int(rows_count)):
row = cells[i * 23: (i + 1) * 23]
row[0] = int(row[0]) # standing
row[5] = int(row[5]) # Game Played
row[6] = int(row[6]) # Goals
row[7] = int(row[7]) # Assists
row[8] = int(row[8]) # Points
row[9] = int(row[9]) # Plus/Minus
row[10] = int(row[10]) # PIM
row[11] = try_parse_float(row[11]) # P/GP
row[12] = int(row[12]) # PPG
row[13] = int(row[13]) # PPP
row[14] = int(row[14]) # SHG
row[15] = int(row[15]) # SHP
row[16] = int(row[16]) # GWG
row[17] = int(row[17]) # OTG
row[18] = int(row[18]) # Shots
row[19] = try_parse_float(row[19]) # Shot %
row[21] = try_parse_float(row[21]) # Shifts/GP
row[22] = try_parse_float(row[22]) # FOW%
players_standings.append(row)
return players_standings
def try_parse_float(x):
return float(x) if not x == "--" else 0
def write_to_csv(csv_dump_path, players_standings, year):
with open(csv_dump_path+"0-1 NHL Standings.csv".format(year, year+1), "w+") as csvfile:
csvwriter = csv.writer(csvfile, delimiter=",")
csvwriter.writerow(csv_header)
for row in players_standings:
csvwriter.writerow(row)
if __name__ == "__main__":
csv_path = os.path.dirname(__file__)+"/../data/"
if not os.path.exists(csv_path):
os.makedirs(csv_path)
scrape_nhl_standings(csv_path, start_year=2007, end_year=2007)
My main concerns are :
- The
parse_standings_page
function. For each cell I need to parse toint
orfloat
some values. Sometimes, if a player didn't have a single point, the Points Per Game Played (P/GP) stat will contain"--"
. This makes for really ugly code, but I can't seem to find a simple way to parse all these values simply. Plus, using indexer for each cells makes it not so readable and it's easy to mess up, I'd like to have a more verbose approach. - General naming conventions. The main goal of this script is to call
scrape_nhl_standings
, should the other methods start with_
to indicate they are "private"?
Of course I'm 100% open to anything else that would improve my code.
python python-3.x web-scraping selenium
add a comment |Â
up vote
6
down vote
favorite
With the idea of gathering the biggest hockey's individual stats dataset ever, I've started with the big league, the NHL.
Using Selenium, I've been able to scrape this NHL's statistics page.
To explain the workflow simply, I extract the data from the standing's table. It's basically a big string with a lots of "n"
to split the data. Then I "click" on the next button, redo, until the whole list is passed. Afterwards I change the NHL season with the URL and redo the steps above.
Everything works fine, but I have some concerns about readability/maintainability.
from selenium import webdriver
from selenium.webdriver.support.ui import Select
import math
import csv
import os
csv_header = ["#", "Player", "Season", "Team", "Pos", "GP", "G", "A", "P", "+/-", "PIM", "P/GP", "PPG",
"PPP", "SHG", "SHP", "GWG", "OTG",
"S", "S%", "TOI/GP", "Shifts/GP", "FOW%"]
def scrape_nhl_standings(csv_dump_path,start_year, end_year):
try:
driver = webdriver.Chrome()
for year in range(start_year, end_year + 1):
driver.get(build_url(year))
# There is a weird bug on the NHL website where the request for 07-08 season also returns the 06-07 so we set it back.
if year == 2007:
year_range = Select(driver.find_element_by_class_name("filter__range").find_element_by_tag_name("select"))
year_range.select_by_value("20072008")
driver.find_element_by_class_name("go").click()
# Force Selenium to wait for the page to reload
driver.implicitly_wait(5)
# We save about 2 seconds by setting the page size to 100.
set_page_size_to_100(driver)
players_standings =
next_page_button = driver.find_element_by_class_name("-next").find_element_by_tag_name("button")
while next_page_button.get_attribute("disabled") is None:
table_standings_page = driver.find_element_by_class_name("rt-tbody")
players_standings += parse_standings_page(table_standings_page.text)
next_page_button.click()
write_to_csv(csv_dump_path, players_standings, year)
print("Finished season 0-1".format(year, year+1))
finally:
# noinspection PyUnboundLocalVariable
driver.close()
def build_url(seasons_start_year):
year_string = str(seasons_start_year) + str(seasons_start_year+1)
return "http://www.nhl.com/stats/player?reportType=season"
"&seasonFrom=0"
"&seasonTo=0"
"&gameType=2"
"&filter=gamesPlayed,gte,1"
"&sort=points,goals,assists".format(year_string)
def set_page_size_to_100(driver):
page_size_dropdown = Select(driver
.find_element_by_class_name("-pageSizeOptions")
.find_element_by_tag_name("select"))
page_size_dropdown.select_by_value("100")
def parse_standings_page(standings):
players_standings =
cells_per_row = 23
cells = standings.split('n')
# There's a problem with markup here, below // isn't a comment
rows_count = len(cells) // cells_per_row
if not rows_count - math.floor(rows_count) == 0:
raise ValueError("Cells count isn't divisible by cells per row.")
for i in range(0,int(rows_count)):
row = cells[i * 23: (i + 1) * 23]
row[0] = int(row[0]) # standing
row[5] = int(row[5]) # Game Played
row[6] = int(row[6]) # Goals
row[7] = int(row[7]) # Assists
row[8] = int(row[8]) # Points
row[9] = int(row[9]) # Plus/Minus
row[10] = int(row[10]) # PIM
row[11] = try_parse_float(row[11]) # P/GP
row[12] = int(row[12]) # PPG
row[13] = int(row[13]) # PPP
row[14] = int(row[14]) # SHG
row[15] = int(row[15]) # SHP
row[16] = int(row[16]) # GWG
row[17] = int(row[17]) # OTG
row[18] = int(row[18]) # Shots
row[19] = try_parse_float(row[19]) # Shot %
row[21] = try_parse_float(row[21]) # Shifts/GP
row[22] = try_parse_float(row[22]) # FOW%
players_standings.append(row)
return players_standings
def try_parse_float(x):
return float(x) if not x == "--" else 0
def write_to_csv(csv_dump_path, players_standings, year):
with open(csv_dump_path+"0-1 NHL Standings.csv".format(year, year+1), "w+") as csvfile:
csvwriter = csv.writer(csvfile, delimiter=",")
csvwriter.writerow(csv_header)
for row in players_standings:
csvwriter.writerow(row)
if __name__ == "__main__":
csv_path = os.path.dirname(__file__)+"/../data/"
if not os.path.exists(csv_path):
os.makedirs(csv_path)
scrape_nhl_standings(csv_path, start_year=2007, end_year=2007)
My main concerns are :
- The
parse_standings_page
function. For each cell I need to parse toint
orfloat
some values. Sometimes, if a player didn't have a single point, the Points Per Game Played (P/GP) stat will contain"--"
. This makes for really ugly code, but I can't seem to find a simple way to parse all these values simply. Plus, using indexer for each cells makes it not so readable and it's easy to mess up, I'd like to have a more verbose approach. - General naming conventions. The main goal of this script is to call
scrape_nhl_standings
, should the other methods start with_
to indicate they are "private"?
Of course I'm 100% open to anything else that would improve my code.
python python-3.x web-scraping selenium
add a comment |Â
up vote
6
down vote
favorite
up vote
6
down vote
favorite
With the idea of gathering the biggest hockey's individual stats dataset ever, I've started with the big league, the NHL.
Using Selenium, I've been able to scrape this NHL's statistics page.
To explain the workflow simply, I extract the data from the standing's table. It's basically a big string with a lots of "n"
to split the data. Then I "click" on the next button, redo, until the whole list is passed. Afterwards I change the NHL season with the URL and redo the steps above.
Everything works fine, but I have some concerns about readability/maintainability.
from selenium import webdriver
from selenium.webdriver.support.ui import Select
import math
import csv
import os
csv_header = ["#", "Player", "Season", "Team", "Pos", "GP", "G", "A", "P", "+/-", "PIM", "P/GP", "PPG",
"PPP", "SHG", "SHP", "GWG", "OTG",
"S", "S%", "TOI/GP", "Shifts/GP", "FOW%"]
def scrape_nhl_standings(csv_dump_path,start_year, end_year):
try:
driver = webdriver.Chrome()
for year in range(start_year, end_year + 1):
driver.get(build_url(year))
# There is a weird bug on the NHL website where the request for 07-08 season also returns the 06-07 so we set it back.
if year == 2007:
year_range = Select(driver.find_element_by_class_name("filter__range").find_element_by_tag_name("select"))
year_range.select_by_value("20072008")
driver.find_element_by_class_name("go").click()
# Force Selenium to wait for the page to reload
driver.implicitly_wait(5)
# We save about 2 seconds by setting the page size to 100.
set_page_size_to_100(driver)
players_standings =
next_page_button = driver.find_element_by_class_name("-next").find_element_by_tag_name("button")
while next_page_button.get_attribute("disabled") is None:
table_standings_page = driver.find_element_by_class_name("rt-tbody")
players_standings += parse_standings_page(table_standings_page.text)
next_page_button.click()
write_to_csv(csv_dump_path, players_standings, year)
print("Finished season 0-1".format(year, year+1))
finally:
# noinspection PyUnboundLocalVariable
driver.close()
def build_url(seasons_start_year):
year_string = str(seasons_start_year) + str(seasons_start_year+1)
return "http://www.nhl.com/stats/player?reportType=season"
"&seasonFrom=0"
"&seasonTo=0"
"&gameType=2"
"&filter=gamesPlayed,gte,1"
"&sort=points,goals,assists".format(year_string)
def set_page_size_to_100(driver):
page_size_dropdown = Select(driver
.find_element_by_class_name("-pageSizeOptions")
.find_element_by_tag_name("select"))
page_size_dropdown.select_by_value("100")
def parse_standings_page(standings):
players_standings =
cells_per_row = 23
cells = standings.split('n')
# There's a problem with markup here, below // isn't a comment
rows_count = len(cells) // cells_per_row
if not rows_count - math.floor(rows_count) == 0:
raise ValueError("Cells count isn't divisible by cells per row.")
for i in range(0,int(rows_count)):
row = cells[i * 23: (i + 1) * 23]
row[0] = int(row[0]) # standing
row[5] = int(row[5]) # Game Played
row[6] = int(row[6]) # Goals
row[7] = int(row[7]) # Assists
row[8] = int(row[8]) # Points
row[9] = int(row[9]) # Plus/Minus
row[10] = int(row[10]) # PIM
row[11] = try_parse_float(row[11]) # P/GP
row[12] = int(row[12]) # PPG
row[13] = int(row[13]) # PPP
row[14] = int(row[14]) # SHG
row[15] = int(row[15]) # SHP
row[16] = int(row[16]) # GWG
row[17] = int(row[17]) # OTG
row[18] = int(row[18]) # Shots
row[19] = try_parse_float(row[19]) # Shot %
row[21] = try_parse_float(row[21]) # Shifts/GP
row[22] = try_parse_float(row[22]) # FOW%
players_standings.append(row)
return players_standings
def try_parse_float(x):
return float(x) if not x == "--" else 0
def write_to_csv(csv_dump_path, players_standings, year):
with open(csv_dump_path+"0-1 NHL Standings.csv".format(year, year+1), "w+") as csvfile:
csvwriter = csv.writer(csvfile, delimiter=",")
csvwriter.writerow(csv_header)
for row in players_standings:
csvwriter.writerow(row)
if __name__ == "__main__":
csv_path = os.path.dirname(__file__)+"/../data/"
if not os.path.exists(csv_path):
os.makedirs(csv_path)
scrape_nhl_standings(csv_path, start_year=2007, end_year=2007)
My main concerns are :
- The
parse_standings_page
function. For each cell I need to parse toint
orfloat
some values. Sometimes, if a player didn't have a single point, the Points Per Game Played (P/GP) stat will contain"--"
. This makes for really ugly code, but I can't seem to find a simple way to parse all these values simply. Plus, using indexer for each cells makes it not so readable and it's easy to mess up, I'd like to have a more verbose approach. - General naming conventions. The main goal of this script is to call
scrape_nhl_standings
, should the other methods start with_
to indicate they are "private"?
Of course I'm 100% open to anything else that would improve my code.
python python-3.x web-scraping selenium
With the idea of gathering the biggest hockey's individual stats dataset ever, I've started with the big league, the NHL.
Using Selenium, I've been able to scrape this NHL's statistics page.
To explain the workflow simply, I extract the data from the standing's table. It's basically a big string with a lots of "n"
to split the data. Then I "click" on the next button, redo, until the whole list is passed. Afterwards I change the NHL season with the URL and redo the steps above.
Everything works fine, but I have some concerns about readability/maintainability.
from selenium import webdriver
from selenium.webdriver.support.ui import Select
import math
import csv
import os
csv_header = ["#", "Player", "Season", "Team", "Pos", "GP", "G", "A", "P", "+/-", "PIM", "P/GP", "PPG",
"PPP", "SHG", "SHP", "GWG", "OTG",
"S", "S%", "TOI/GP", "Shifts/GP", "FOW%"]
def scrape_nhl_standings(csv_dump_path,start_year, end_year):
try:
driver = webdriver.Chrome()
for year in range(start_year, end_year + 1):
driver.get(build_url(year))
# There is a weird bug on the NHL website where the request for 07-08 season also returns the 06-07 so we set it back.
if year == 2007:
year_range = Select(driver.find_element_by_class_name("filter__range").find_element_by_tag_name("select"))
year_range.select_by_value("20072008")
driver.find_element_by_class_name("go").click()
# Force Selenium to wait for the page to reload
driver.implicitly_wait(5)
# We save about 2 seconds by setting the page size to 100.
set_page_size_to_100(driver)
players_standings =
next_page_button = driver.find_element_by_class_name("-next").find_element_by_tag_name("button")
while next_page_button.get_attribute("disabled") is None:
table_standings_page = driver.find_element_by_class_name("rt-tbody")
players_standings += parse_standings_page(table_standings_page.text)
next_page_button.click()
write_to_csv(csv_dump_path, players_standings, year)
print("Finished season 0-1".format(year, year+1))
finally:
# noinspection PyUnboundLocalVariable
driver.close()
def build_url(seasons_start_year):
year_string = str(seasons_start_year) + str(seasons_start_year+1)
return "http://www.nhl.com/stats/player?reportType=season"
"&seasonFrom=0"
"&seasonTo=0"
"&gameType=2"
"&filter=gamesPlayed,gte,1"
"&sort=points,goals,assists".format(year_string)
def set_page_size_to_100(driver):
page_size_dropdown = Select(driver
.find_element_by_class_name("-pageSizeOptions")
.find_element_by_tag_name("select"))
page_size_dropdown.select_by_value("100")
def parse_standings_page(standings):
players_standings =
cells_per_row = 23
cells = standings.split('n')
# There's a problem with markup here, below // isn't a comment
rows_count = len(cells) // cells_per_row
if not rows_count - math.floor(rows_count) == 0:
raise ValueError("Cells count isn't divisible by cells per row.")
for i in range(0,int(rows_count)):
row = cells[i * 23: (i + 1) * 23]
row[0] = int(row[0]) # standing
row[5] = int(row[5]) # Game Played
row[6] = int(row[6]) # Goals
row[7] = int(row[7]) # Assists
row[8] = int(row[8]) # Points
row[9] = int(row[9]) # Plus/Minus
row[10] = int(row[10]) # PIM
row[11] = try_parse_float(row[11]) # P/GP
row[12] = int(row[12]) # PPG
row[13] = int(row[13]) # PPP
row[14] = int(row[14]) # SHG
row[15] = int(row[15]) # SHP
row[16] = int(row[16]) # GWG
row[17] = int(row[17]) # OTG
row[18] = int(row[18]) # Shots
row[19] = try_parse_float(row[19]) # Shot %
row[21] = try_parse_float(row[21]) # Shifts/GP
row[22] = try_parse_float(row[22]) # FOW%
players_standings.append(row)
return players_standings
def try_parse_float(x):
return float(x) if not x == "--" else 0
def write_to_csv(csv_dump_path, players_standings, year):
with open(csv_dump_path+"0-1 NHL Standings.csv".format(year, year+1), "w+") as csvfile:
csvwriter = csv.writer(csvfile, delimiter=",")
csvwriter.writerow(csv_header)
for row in players_standings:
csvwriter.writerow(row)
if __name__ == "__main__":
csv_path = os.path.dirname(__file__)+"/../data/"
if not os.path.exists(csv_path):
os.makedirs(csv_path)
scrape_nhl_standings(csv_path, start_year=2007, end_year=2007)
My main concerns are :
- The
parse_standings_page
function. For each cell I need to parse toint
orfloat
some values. Sometimes, if a player didn't have a single point, the Points Per Game Played (P/GP) stat will contain"--"
. This makes for really ugly code, but I can't seem to find a simple way to parse all these values simply. Plus, using indexer for each cells makes it not so readable and it's easy to mess up, I'd like to have a more verbose approach. - General naming conventions. The main goal of this script is to call
scrape_nhl_standings
, should the other methods start with_
to indicate they are "private"?
Of course I'm 100% open to anything else that would improve my code.
python python-3.x web-scraping selenium
edited Apr 27 at 15:48
Peilonrayz
24.3k336102
24.3k336102
asked Apr 27 at 15:32
IEatBagels
8,50623077
8,50623077
add a comment |Â
add a comment |Â
2 Answers
2
active
oldest
votes
up vote
6
down vote
accepted
I will make a few points about the code as code, but the first thing to consider in any sort of data collection is respecting the rights of the owners of the data.
In particular the terms of service on the NHL site state
You may not access or use, or attempt to access or use, the Services to take any action that could harm us or any other person or entity (each a "person"), interfere with the operation of the Services, or use the Services in a manner that violates any laws. For example, you may not: ... Engage in unauthorized spidering, scraping, or harvesting of content or information, or use any other unauthorized automated means to compile information;
Therefore consider that using this script for the purpose it is written is probably illegal. If you actually set yourself up in competition to the site using data gained in this manner, you would almost guarantee yourself a lawsuit to defend.
# There's a problem with markup here, below // isn't a comment
rows_count = len(cells) // cells_per_row
if not rows_count - math.floor(rows_count) == 0:
raise ValueError("Cells count isn't divisible by cells per row.")
The //
here is expressly integer division, and already does the flooring. As such the error you're trying to detect cannot be raised. It could instead be written
if rows_count * cells_per_row is not len(cells):
def try_parse_float(x):
return float(x) if not x == "--" else 0
I don't think that's actually that ugly. However, even in Python you should pay attention to types, especially in a function that is meant to change the type! That will return an int 0 if not told otherwise.
for row in players_standings:
csvwriter.writerow(row)
Python is pretty good at providing tools that operate on collections directly, so it's worth checking the docs before writing loops. csvwriter.writerows()
might be the function you're after here.
In general writing to disk in larger chunks is preferable to writing in smaller chunks, especially on spinning hard disks where the seek time is considerable.
General naming conventions. The main goal of this script is to call
scrape_nhl_standings, should the other methods start with _ to
indicate they are "private"?
The concept of private methods is generally one from object oriented programming. While Python supports that, and you're right that underscores are the recommended way to indicate privacy in that context, that is not what you are doing here. That's not to say it wouldn't work; it just seems slightly out of place in this coding style. In any case, for matters of style, check PEP.
You could potentially shorten parse_standings_page
by having a list of the parsing functions applicable to each index, and looping through the indices. You could also potentially avoid looping i
by reshaping cells (into a 23 by rows_count
2D array) and then mapping a row parse function to it. However, I actually think what you have is clearer.
1
Very good catch about the terms of service. I didn't think about this at all when I started my project. Though it'll only be used by me, it's still something that I'll need to keep in mind for the future.
â IEatBagels
May 2 at 13:21
add a comment |Â
up vote
5
down vote
A few more comment to add after @Josiah good answer:
You can simplify the driver management in
scrape_nhl_standings
by using a context manager. It seems like selenium doesn't support it natively but you can make use ofcontextlib.closing
:from contextlib import closing
def scrape_nhl_standings(csv_dump_path,start_year, end_year):
with closing(webdriver.Chrome()) as driver:
for year in range(start_year, end_year + 1):
driver.get(build_url(year))
...You could change
try_parse_float
to:def try_parse_float(x):
try:
return float(x)
except ValueError:
return 0.0so it is more robust, and possibly a tiny bit faster if most of the values are actually floats.
Did you read the note on
os.makedirs
?
Note:
makedirs()
will become confused if the path elements to create includepardir
(eg. âÂÂ..â on UNIX systems).Instead, I recommend using
dirname
again:file_folder = os.path.abspath(os.path.dirname(__file__))
csv_path = os.path.join(os.path.dirname(file_folder), 'data')
os.makedirs(csv_path, exist_ok=True)
add a comment |Â
2 Answers
2
active
oldest
votes
2 Answers
2
active
oldest
votes
active
oldest
votes
active
oldest
votes
up vote
6
down vote
accepted
I will make a few points about the code as code, but the first thing to consider in any sort of data collection is respecting the rights of the owners of the data.
In particular the terms of service on the NHL site state
You may not access or use, or attempt to access or use, the Services to take any action that could harm us or any other person or entity (each a "person"), interfere with the operation of the Services, or use the Services in a manner that violates any laws. For example, you may not: ... Engage in unauthorized spidering, scraping, or harvesting of content or information, or use any other unauthorized automated means to compile information;
Therefore consider that using this script for the purpose it is written is probably illegal. If you actually set yourself up in competition to the site using data gained in this manner, you would almost guarantee yourself a lawsuit to defend.
# There's a problem with markup here, below // isn't a comment
rows_count = len(cells) // cells_per_row
if not rows_count - math.floor(rows_count) == 0:
raise ValueError("Cells count isn't divisible by cells per row.")
The //
here is expressly integer division, and already does the flooring. As such the error you're trying to detect cannot be raised. It could instead be written
if rows_count * cells_per_row is not len(cells):
def try_parse_float(x):
return float(x) if not x == "--" else 0
I don't think that's actually that ugly. However, even in Python you should pay attention to types, especially in a function that is meant to change the type! That will return an int 0 if not told otherwise.
for row in players_standings:
csvwriter.writerow(row)
Python is pretty good at providing tools that operate on collections directly, so it's worth checking the docs before writing loops. csvwriter.writerows()
might be the function you're after here.
In general writing to disk in larger chunks is preferable to writing in smaller chunks, especially on spinning hard disks where the seek time is considerable.
General naming conventions. The main goal of this script is to call
scrape_nhl_standings, should the other methods start with _ to
indicate they are "private"?
The concept of private methods is generally one from object oriented programming. While Python supports that, and you're right that underscores are the recommended way to indicate privacy in that context, that is not what you are doing here. That's not to say it wouldn't work; it just seems slightly out of place in this coding style. In any case, for matters of style, check PEP.
You could potentially shorten parse_standings_page
by having a list of the parsing functions applicable to each index, and looping through the indices. You could also potentially avoid looping i
by reshaping cells (into a 23 by rows_count
2D array) and then mapping a row parse function to it. However, I actually think what you have is clearer.
1
Very good catch about the terms of service. I didn't think about this at all when I started my project. Though it'll only be used by me, it's still something that I'll need to keep in mind for the future.
â IEatBagels
May 2 at 13:21
add a comment |Â
up vote
6
down vote
accepted
I will make a few points about the code as code, but the first thing to consider in any sort of data collection is respecting the rights of the owners of the data.
In particular the terms of service on the NHL site state
You may not access or use, or attempt to access or use, the Services to take any action that could harm us or any other person or entity (each a "person"), interfere with the operation of the Services, or use the Services in a manner that violates any laws. For example, you may not: ... Engage in unauthorized spidering, scraping, or harvesting of content or information, or use any other unauthorized automated means to compile information;
Therefore consider that using this script for the purpose it is written is probably illegal. If you actually set yourself up in competition to the site using data gained in this manner, you would almost guarantee yourself a lawsuit to defend.
# There's a problem with markup here, below // isn't a comment
rows_count = len(cells) // cells_per_row
if not rows_count - math.floor(rows_count) == 0:
raise ValueError("Cells count isn't divisible by cells per row.")
The //
here is expressly integer division, and already does the flooring. As such the error you're trying to detect cannot be raised. It could instead be written
if rows_count * cells_per_row is not len(cells):
def try_parse_float(x):
return float(x) if not x == "--" else 0
I don't think that's actually that ugly. However, even in Python you should pay attention to types, especially in a function that is meant to change the type! That will return an int 0 if not told otherwise.
for row in players_standings:
csvwriter.writerow(row)
Python is pretty good at providing tools that operate on collections directly, so it's worth checking the docs before writing loops. csvwriter.writerows()
might be the function you're after here.
In general writing to disk in larger chunks is preferable to writing in smaller chunks, especially on spinning hard disks where the seek time is considerable.
General naming conventions. The main goal of this script is to call
scrape_nhl_standings, should the other methods start with _ to
indicate they are "private"?
The concept of private methods is generally one from object oriented programming. While Python supports that, and you're right that underscores are the recommended way to indicate privacy in that context, that is not what you are doing here. That's not to say it wouldn't work; it just seems slightly out of place in this coding style. In any case, for matters of style, check PEP.
You could potentially shorten parse_standings_page
by having a list of the parsing functions applicable to each index, and looping through the indices. You could also potentially avoid looping i
by reshaping cells (into a 23 by rows_count
2D array) and then mapping a row parse function to it. However, I actually think what you have is clearer.
1
Very good catch about the terms of service. I didn't think about this at all when I started my project. Though it'll only be used by me, it's still something that I'll need to keep in mind for the future.
â IEatBagels
May 2 at 13:21
add a comment |Â
up vote
6
down vote
accepted
up vote
6
down vote
accepted
I will make a few points about the code as code, but the first thing to consider in any sort of data collection is respecting the rights of the owners of the data.
In particular the terms of service on the NHL site state
You may not access or use, or attempt to access or use, the Services to take any action that could harm us or any other person or entity (each a "person"), interfere with the operation of the Services, or use the Services in a manner that violates any laws. For example, you may not: ... Engage in unauthorized spidering, scraping, or harvesting of content or information, or use any other unauthorized automated means to compile information;
Therefore consider that using this script for the purpose it is written is probably illegal. If you actually set yourself up in competition to the site using data gained in this manner, you would almost guarantee yourself a lawsuit to defend.
# There's a problem with markup here, below // isn't a comment
rows_count = len(cells) // cells_per_row
if not rows_count - math.floor(rows_count) == 0:
raise ValueError("Cells count isn't divisible by cells per row.")
The //
here is expressly integer division, and already does the flooring. As such the error you're trying to detect cannot be raised. It could instead be written
if rows_count * cells_per_row is not len(cells):
def try_parse_float(x):
return float(x) if not x == "--" else 0
I don't think that's actually that ugly. However, even in Python you should pay attention to types, especially in a function that is meant to change the type! That will return an int 0 if not told otherwise.
for row in players_standings:
csvwriter.writerow(row)
Python is pretty good at providing tools that operate on collections directly, so it's worth checking the docs before writing loops. csvwriter.writerows()
might be the function you're after here.
In general writing to disk in larger chunks is preferable to writing in smaller chunks, especially on spinning hard disks where the seek time is considerable.
General naming conventions. The main goal of this script is to call
scrape_nhl_standings, should the other methods start with _ to
indicate they are "private"?
The concept of private methods is generally one from object oriented programming. While Python supports that, and you're right that underscores are the recommended way to indicate privacy in that context, that is not what you are doing here. That's not to say it wouldn't work; it just seems slightly out of place in this coding style. In any case, for matters of style, check PEP.
You could potentially shorten parse_standings_page
by having a list of the parsing functions applicable to each index, and looping through the indices. You could also potentially avoid looping i
by reshaping cells (into a 23 by rows_count
2D array) and then mapping a row parse function to it. However, I actually think what you have is clearer.
I will make a few points about the code as code, but the first thing to consider in any sort of data collection is respecting the rights of the owners of the data.
In particular the terms of service on the NHL site state
You may not access or use, or attempt to access or use, the Services to take any action that could harm us or any other person or entity (each a "person"), interfere with the operation of the Services, or use the Services in a manner that violates any laws. For example, you may not: ... Engage in unauthorized spidering, scraping, or harvesting of content or information, or use any other unauthorized automated means to compile information;
Therefore consider that using this script for the purpose it is written is probably illegal. If you actually set yourself up in competition to the site using data gained in this manner, you would almost guarantee yourself a lawsuit to defend.
# There's a problem with markup here, below // isn't a comment
rows_count = len(cells) // cells_per_row
if not rows_count - math.floor(rows_count) == 0:
raise ValueError("Cells count isn't divisible by cells per row.")
The //
here is expressly integer division, and already does the flooring. As such the error you're trying to detect cannot be raised. It could instead be written
if rows_count * cells_per_row is not len(cells):
def try_parse_float(x):
return float(x) if not x == "--" else 0
I don't think that's actually that ugly. However, even in Python you should pay attention to types, especially in a function that is meant to change the type! That will return an int 0 if not told otherwise.
for row in players_standings:
csvwriter.writerow(row)
Python is pretty good at providing tools that operate on collections directly, so it's worth checking the docs before writing loops. csvwriter.writerows()
might be the function you're after here.
In general writing to disk in larger chunks is preferable to writing in smaller chunks, especially on spinning hard disks where the seek time is considerable.
General naming conventions. The main goal of this script is to call
scrape_nhl_standings, should the other methods start with _ to
indicate they are "private"?
The concept of private methods is generally one from object oriented programming. While Python supports that, and you're right that underscores are the recommended way to indicate privacy in that context, that is not what you are doing here. That's not to say it wouldn't work; it just seems slightly out of place in this coding style. In any case, for matters of style, check PEP.
You could potentially shorten parse_standings_page
by having a list of the parsing functions applicable to each index, and looping through the indices. You could also potentially avoid looping i
by reshaping cells (into a 23 by rows_count
2D array) and then mapping a row parse function to it. However, I actually think what you have is clearer.
answered May 2 at 7:52
Josiah
3,182326
3,182326
1
Very good catch about the terms of service. I didn't think about this at all when I started my project. Though it'll only be used by me, it's still something that I'll need to keep in mind for the future.
â IEatBagels
May 2 at 13:21
add a comment |Â
1
Very good catch about the terms of service. I didn't think about this at all when I started my project. Though it'll only be used by me, it's still something that I'll need to keep in mind for the future.
â IEatBagels
May 2 at 13:21
1
1
Very good catch about the terms of service. I didn't think about this at all when I started my project. Though it'll only be used by me, it's still something that I'll need to keep in mind for the future.
â IEatBagels
May 2 at 13:21
Very good catch about the terms of service. I didn't think about this at all when I started my project. Though it'll only be used by me, it's still something that I'll need to keep in mind for the future.
â IEatBagels
May 2 at 13:21
add a comment |Â
up vote
5
down vote
A few more comment to add after @Josiah good answer:
You can simplify the driver management in
scrape_nhl_standings
by using a context manager. It seems like selenium doesn't support it natively but you can make use ofcontextlib.closing
:from contextlib import closing
def scrape_nhl_standings(csv_dump_path,start_year, end_year):
with closing(webdriver.Chrome()) as driver:
for year in range(start_year, end_year + 1):
driver.get(build_url(year))
...You could change
try_parse_float
to:def try_parse_float(x):
try:
return float(x)
except ValueError:
return 0.0so it is more robust, and possibly a tiny bit faster if most of the values are actually floats.
Did you read the note on
os.makedirs
?
Note:
makedirs()
will become confused if the path elements to create includepardir
(eg. âÂÂ..â on UNIX systems).Instead, I recommend using
dirname
again:file_folder = os.path.abspath(os.path.dirname(__file__))
csv_path = os.path.join(os.path.dirname(file_folder), 'data')
os.makedirs(csv_path, exist_ok=True)
add a comment |Â
up vote
5
down vote
A few more comment to add after @Josiah good answer:
You can simplify the driver management in
scrape_nhl_standings
by using a context manager. It seems like selenium doesn't support it natively but you can make use ofcontextlib.closing
:from contextlib import closing
def scrape_nhl_standings(csv_dump_path,start_year, end_year):
with closing(webdriver.Chrome()) as driver:
for year in range(start_year, end_year + 1):
driver.get(build_url(year))
...You could change
try_parse_float
to:def try_parse_float(x):
try:
return float(x)
except ValueError:
return 0.0so it is more robust, and possibly a tiny bit faster if most of the values are actually floats.
Did you read the note on
os.makedirs
?
Note:
makedirs()
will become confused if the path elements to create includepardir
(eg. âÂÂ..â on UNIX systems).Instead, I recommend using
dirname
again:file_folder = os.path.abspath(os.path.dirname(__file__))
csv_path = os.path.join(os.path.dirname(file_folder), 'data')
os.makedirs(csv_path, exist_ok=True)
add a comment |Â
up vote
5
down vote
up vote
5
down vote
A few more comment to add after @Josiah good answer:
You can simplify the driver management in
scrape_nhl_standings
by using a context manager. It seems like selenium doesn't support it natively but you can make use ofcontextlib.closing
:from contextlib import closing
def scrape_nhl_standings(csv_dump_path,start_year, end_year):
with closing(webdriver.Chrome()) as driver:
for year in range(start_year, end_year + 1):
driver.get(build_url(year))
...You could change
try_parse_float
to:def try_parse_float(x):
try:
return float(x)
except ValueError:
return 0.0so it is more robust, and possibly a tiny bit faster if most of the values are actually floats.
Did you read the note on
os.makedirs
?
Note:
makedirs()
will become confused if the path elements to create includepardir
(eg. âÂÂ..â on UNIX systems).Instead, I recommend using
dirname
again:file_folder = os.path.abspath(os.path.dirname(__file__))
csv_path = os.path.join(os.path.dirname(file_folder), 'data')
os.makedirs(csv_path, exist_ok=True)
A few more comment to add after @Josiah good answer:
You can simplify the driver management in
scrape_nhl_standings
by using a context manager. It seems like selenium doesn't support it natively but you can make use ofcontextlib.closing
:from contextlib import closing
def scrape_nhl_standings(csv_dump_path,start_year, end_year):
with closing(webdriver.Chrome()) as driver:
for year in range(start_year, end_year + 1):
driver.get(build_url(year))
...You could change
try_parse_float
to:def try_parse_float(x):
try:
return float(x)
except ValueError:
return 0.0so it is more robust, and possibly a tiny bit faster if most of the values are actually floats.
Did you read the note on
os.makedirs
?
Note:
makedirs()
will become confused if the path elements to create includepardir
(eg. âÂÂ..â on UNIX systems).Instead, I recommend using
dirname
again:file_folder = os.path.abspath(os.path.dirname(__file__))
csv_path = os.path.join(os.path.dirname(file_folder), 'data')
os.makedirs(csv_path, exist_ok=True)
answered May 2 at 8:27
Mathias Ettinger
21.8k32876
21.8k32876
add a comment |Â
add a comment |Â
Sign up or log in
StackExchange.ready(function ()
StackExchange.helpers.onClickDraftSave('#login-link');
);
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
StackExchange.ready(
function ()
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f193087%2fscraping-nhl-individual-stats%23new-answer', 'question_page');
);
Post as a guest
Sign up or log in
StackExchange.ready(function ()
StackExchange.helpers.onClickDraftSave('#login-link');
);
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Sign up or log in
StackExchange.ready(function ()
StackExchange.helpers.onClickDraftSave('#login-link');
);
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Sign up or log in
StackExchange.ready(function ()
StackExchange.helpers.onClickDraftSave('#login-link');
);
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Sign up using Google
Sign up using Facebook
Sign up using Email and Password