Finding a swimmer's competing position from a website
Clash Royale CLAN TAG#URR8PPP
.everyoneloves__top-leaderboard:empty,.everyoneloves__mid-leaderboard:empty margin-bottom:0;
up vote
2
down vote
favorite
This is a code that allows a swimmer to know his position with respect to other competitors. It gets the swimmer's time from the meet's website, finds other competitors' times and shows the better ones along with the competitors' names and heats. I do not have access to the website's database, so I was forced to use string indexing.
A meet page looks like this: http://csz-eg.org/images/stories/Contents/2018/Swimming/CSSC2018/Heats/HTML/Day_11-2.htm
We have the first part of the URL entered (http://csz-eg.org/images/stories/Contents/2018/Swimming/) then we concatenate the rest.
from urllib.request import *
from urllib.error import *
# Global variables that are used in multiple functions
html = ""            # full text of the downloaded meet page
event_start = 0      # index in html where the swimmer's event begins
event_end = 0        # index in html where the swimmer's event ends
age = ""             # the swimmer's age as it appears on the page
own_time = ""        # the swimmer's own time string (hh:mm:ss:tt)
final_times_dict = {}  # maps converted (numeric) time -> original time string
def nameheat_getter(time):
    """Return [name, heat] for the swimmer owning *time* (format hh:mm:ss:tt).

    Searches the page text for the seconds/hundredths part of the time, then
    reads the name at a fixed 61-character offset before it and the heat
    number from the nearest preceding "Heat" header.
    """
    position = html.find(time[6:])
    # The name is printed a fixed 61 characters before the time field.
    start_of_name = position - 61
    # The closest "Heat NN" header before the name gives the heat number.
    start_of_heat = html.rfind("Heat", 0, start_of_name) + 5
    return [html[start_of_name:start_of_name + 20],
            html[start_of_heat:start_of_heat + 2]]
def final_print(finaltimes):
    """Print the swimmer's seed position and each faster competitor.

    finaltimes is a sorted list of converted (numeric) times; each key maps
    back to the original time string through the global final_times_dict.
    """
    # The "\n" escapes were flattened to a bare "n" in transcription; restore them.
    print("===============\nYour seed position is " + str(len(finaltimes) + 1) + "\n===============")
    print("\nCompetitors with better times: \n")
    for converted in finaltimes:
        time_string = final_times_dict[converted]
        nameheat = nameheat_getter(time_string)
        print("Name: " + nameheat[0])
        print("Heat: " + nameheat[1])
        print("Time: " + time_string)
        print()
def converter(item):
    """Convert a time string "hh:mm:ss:tt" into total seconds as a float.

    The final two digits ("tt") are hundredths of a second, hence the
    division by 100.
    """
    hours, minutes, seconds = int(item[0:2]), int(item[3:5]), int(item[6:8])
    hundredths = int(item[9:])
    whole_seconds = (hours * 60 + minutes) * 60 + seconds
    return whole_seconds + hundredths / 100
def comparer(timesx):
    """Collect the competitor times faster than the swimmer's own and print them.

    Each better time is stored in the global final_times_dict keyed by its
    numeric value so the keys sort chronologically; the sorted keys are then
    handed to final_print().
    """
    own_time_converted = converter(own_time)
    for time_string in timesx:
        # Convert once and reuse — the original converted each time twice.
        converted = converter(time_string)
        if converted < own_time_converted:
            final_times_dict[converted] = time_string
    final_print(sorted(final_times_dict))
def times_grouper():  # puts the competitors' times in a list
    """Scan the swimmer's event slice of the page for competitor times.

    Every occurrence of the swimmer's age between event_start and event_end
    is a candidate entry; a real entry has the "." of a time value at a
    fixed +33 offset. Collected times are passed to comparer().
    """
    itern = event_start
    times = []  # the scrape dropped the list initializer; it must start empty
    while True:
        age_index = html.find(age, itern, event_end)
        if age_index == -1:
            break
        # Not a real entry row (no seconds.hundredths separator): skip it.
        if html[age_index + 33] != ".":
            itern = age_index + 1
            continue
        time = html[age_index + 25:age_index + 36]
        # NOTE(review): after the first .replace(" ", "00:") no spaces remain,
        # so the second .replace(" ", "0") can never match — presumably the
        # first was meant to match a multi-space run; confirm against the page.
        times.append(time.replace(" ", "00:").replace(" ", "0"))
        itern = age_index + 1
    comparer(times)
def geturl():  # Gets the race url and concatenates it with the site's URL
    """Prompt for the rest of the race URL, download the page, and continue.

    Stores the decoded page text in the global `html` and hands control to
    base_manager(). On an HTTP error the whole flow restarts from main().
    """
    global html
    try:
        print("Complete the race URL (HTML)\nhttp://csz-eg.org/images/stories/Contents/2018/Swimming/...")
        # Strip spaces from the user's input before appending it to the base URL.
        # The call chain was split across lines in transcription; rejoined here.
        html = urlopen("http://csz-eg.org/images/stories/Contents/2018/Swimming/"
                       + input().replace(" ", "")).read().decode('utf-8')
        base_manager(getname())
    except HTTPError:
        print("Invalid URL.")
        main()
def getname():
    """Prompt for and return the swimmer's name."""
    # input() with an explicit prompt emits the same text as print() + input().
    return input("Insert your name\n")
def base_manager(string):  # Gets the swimmer's age, his time, the event starting index and the event ending index
    """Locate *string* (the swimmer's name) on the page and extract his age,
    time, and the boundaries of his event, then start the comparison.

    Retries with a new name (recursively) when the name is absent or the
    match fails lane-number validation.
    """
    global event_start, age, event_end, own_time
    try:
        # Case-insensitive search; str.index raises ValueError when absent.
        namei = html.lower().index(string.lower())
        nos = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10"]
        # A real entry row has the lane number 3 characters before the name.
        # NOTE(review): html[namei - 3] is a single character, so the "10"
        # entry can never match it — confirm how lane-10 rows are validated.
        if html[namei - 3] not in nos:
            raise ValueError
        # Fixed offsets relative to the name: age at +30, time field at +55.
        age = html[namei + 30:namei + 32]
        # NOTE(review): after the first .replace(" ", "00:") no spaces remain,
        # so the second .replace(" ", "0") is a no-op — presumably the first
        # was meant to match a run of spaces; confirm against the page layout.
        own_time = html[namei + 55: namei + 66].replace(" ", "00:").replace(" ", "0")
        # Event boundaries: every event header ends with "=", and the next
        # event begins with the word "Event".
        event_start = html.rfind("=", 0, namei)
        event_end = html.find("Event", namei)
        if own_time == "00:00:00:NT":
            # "NT" = no recorded time: treat every competitor as faster.
            print("You have no time.")
            own_time = "99:99:99:99"
        else:
            print("Your own time is " + own_time)
        times_grouper()
    except ValueError:
        print("Name not found.")
        base_manager(getname())
def main():  # The main function
    """Reset all module-level state and start a fresh query."""
    global html, event_start, event_end, age, own_time, final_times_dict
    html = ""
    event_start = 0
    event_end = 0
    age = ""
    own_time = ""
    final_times_dict = {}  # the scrape dropped the "{}" initializer
    geturl()


if __name__ == "__main__":
    main()
When a swimmer's time is "NT", this means that the swimmer has no time, thus we consider all of the competitors better than him. The fastest swimmers are in the later heats while the slower ones are in the first heats.
A swimmer's competitors are the ones with the same age and better (smaller) times.
We check the swimmer's event by reverse finding the first "=" symbol since every event header ends with it and it is a distinct character.
We validate the swimmer's name by checking the lane number which is two characters before his name.
We check the competitors heats by reverse finding the word "heat" (the heat number would be two characters after it) which is also a distinct string.
We print the swimmer's own time, his position and the competitors' heats and times.
How can I make this code better/more optimized/smaller/more readable?
python python-3.x strings formatting web-scraping
add a comment |
up vote
2
down vote
favorite
This is a code that allows a swimmer to know his position with respect to other competitors. It gets the swimmer's time from the meet's website, finds other competitors' times and shows the better ones along with the competitors' names and heats. I do not have access to the website's database, so I was forced to use string indexing.
A meet page looks like this: http://csz-eg.org/images/stories/Contents/2018/Swimming/CSSC2018/Heats/HTML/Day_11-2.htm
We have the first part of the URL entered (http://csz-eg.org/images/stories/Contents/2018/Swimming/) then we concatenate the rest.
from urllib.request import *
from urllib.error import *
# Global variables that are used in multiple functions
html = ""
event_start = 0
event_end = 0
age = ""
own_time = ""
final_times_dict =
def nameheat_getter(time): # (time is hh:mm:ss:tt) gets the name and heat of a certain time's owner
time_index = html.find(time[6:])
name_index = time_index - 61
name = html[name_index: name_index + 20]
heat_index = html.rfind("Heat", 0, name_index) + 5
heat = html[heat_index: heat_index + 2]
return [name, heat]
def final_print(finaltimes): # Prints the seed position of the swimmer and better swimmers' times
print("===============nYour seed position is " + str(len(finaltimes) + 1) + "n===============")
print("nCompetitors with better times: n")
for item in finaltimes:
item = final_times_dict[item]
nameheat = nameheat_getter(item)
print("Name: " + nameheat[0])
print("Heat: " + nameheat[1])
print("Time: " + item)
print()
def converter(item): # Converts the time string to hours, minutes, seconds and milliseconds to be able to compare them
hours = int(item[0:2])
minutes = int(item[3:5])
seconds = int(item[6:8])
ms = int(item[9:])
return float(hours*60*60 + minutes*60 + seconds + ms/100)
def comparer(timesx): # Compares the swimmer's time with the competitor's times, appends the better times to a list.
own_time_converted = converter(own_time)
for item in timesx:
if converter(item) < own_time_converted:
final_times_dict[converter(item)] = item
final_times_lst = sorted(final_times_dict)
final_print(final_times_lst)
def times_grouper(): # puts the competitors' times in a list
itern = event_start
times =
while True:
age_index = html.find(age, itern, event_end)
if age_index == -1:
break
if html[age_index + 33] != ".":
itern = age_index + 1
continue
time = html[age_index + 25:age_index + 36]
times.append(time.replace(" ", "00:").replace(" ", "0"))
itern = age_index+1
comparer(times)
def geturl(): # Gets the race url and concatenates it with the site's URL
global html
try:
print("Complete the race URL (HTML)nhttp://csz-eg.org/images/stories/Contents/2018/Swimming/...")
html = urlopen("http://csz-eg.org/images/stories/Contents/2018/Swimming/" + input().replace(" ", ""))
.read().decode('utf-8')
base_manager(getname())
except HTTPError:
print("Invalid URL.")
main()
def getname(): # Get's the swimmer's name
print("Insert your name")
return input()
def base_manager(string): # Gets the swimmer's age, his time, the event starting index and the event ending index
global event_start, age, event_end, own_time
try:
namei = html.lower().index(string.lower())
nos = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10"]
if html[namei - 3] not in nos:
raise ValueError
age = html[namei + 30:namei + 32]
own_time = html[namei + 55: namei + 66].replace(" ", "00:").replace(" ", "0")
event_start = html.rfind("=", 0, namei)
event_end = html.find("Event", namei)
if own_time == "00:00:00:NT":
print("You have no time.")
own_time = "99:99:99:99"
else:
print("Your own time is " + own_time)
times_grouper()
except ValueError:
print("Name not found.")
base_manager(getname())
def main(): # The main function
global html, event_start, event_end, age, own_time, final_times_dict
html = ""
event_start = 0
event_end = 0
age = ""
own_time = ""
final_times_dict =
geturl()
if __name__ == "__main__":
main()
When a swimmer's time is "NT", this means that the swimmer has no time, thus we consider all of the competitors better than him. The fastest swimmers are in the later heats while the slower ones are in the first heats.
A swimmer's competitors are the ones with the same age and better (smaller) times.
We check the swimmer's event by reverse finding the first "=" symbol since every event header ends with it and it is a distinct character.
We validate the swimmer's name by checking the lane number which is two characters before his name.
We check the competitors heats by reverse finding the word "heat" (the heat number would be two characters after it) which is also a distinct string.
We print the swimmer's own time, his position and the competitors' heats and times.
How can I make this code better/more optimized/smaller/more readable?
python python-3.x strings formatting web-scraping
add a comment |Â
up vote
2
down vote
favorite
up vote
2
down vote
favorite
This is a code that allows a swimmer to know his position with respect to other competitors. It gets the swimmer's time from the meet's website, finds other competitors' times and shows the better ones along with the competitors' names and heats. I do not have access to the website's database, so I was forced to use string indexing.
A meet page looks like this: http://csz-eg.org/images/stories/Contents/2018/Swimming/CSSC2018/Heats/HTML/Day_11-2.htm
We have the first part of the URL entered (http://csz-eg.org/images/stories/Contents/2018/Swimming/) then we concatenate the rest.
from urllib.request import *
from urllib.error import *
# Global variables that are used in multiple functions
html = ""
event_start = 0
event_end = 0
age = ""
own_time = ""
final_times_dict =
def nameheat_getter(time): # (time is hh:mm:ss:tt) gets the name and heat of a certain time's owner
time_index = html.find(time[6:])
name_index = time_index - 61
name = html[name_index: name_index + 20]
heat_index = html.rfind("Heat", 0, name_index) + 5
heat = html[heat_index: heat_index + 2]
return [name, heat]
def final_print(finaltimes): # Prints the seed position of the swimmer and better swimmers' times
print("===============nYour seed position is " + str(len(finaltimes) + 1) + "n===============")
print("nCompetitors with better times: n")
for item in finaltimes:
item = final_times_dict[item]
nameheat = nameheat_getter(item)
print("Name: " + nameheat[0])
print("Heat: " + nameheat[1])
print("Time: " + item)
print()
def converter(item): # Converts the time string to hours, minutes, seconds and milliseconds to be able to compare them
hours = int(item[0:2])
minutes = int(item[3:5])
seconds = int(item[6:8])
ms = int(item[9:])
return float(hours*60*60 + minutes*60 + seconds + ms/100)
def comparer(timesx): # Compares the swimmer's time with the competitor's times, appends the better times to a list.
own_time_converted = converter(own_time)
for item in timesx:
if converter(item) < own_time_converted:
final_times_dict[converter(item)] = item
final_times_lst = sorted(final_times_dict)
final_print(final_times_lst)
def times_grouper(): # puts the competitors' times in a list
itern = event_start
times =
while True:
age_index = html.find(age, itern, event_end)
if age_index == -1:
break
if html[age_index + 33] != ".":
itern = age_index + 1
continue
time = html[age_index + 25:age_index + 36]
times.append(time.replace(" ", "00:").replace(" ", "0"))
itern = age_index+1
comparer(times)
def geturl(): # Gets the race url and concatenates it with the site's URL
global html
try:
print("Complete the race URL (HTML)nhttp://csz-eg.org/images/stories/Contents/2018/Swimming/...")
html = urlopen("http://csz-eg.org/images/stories/Contents/2018/Swimming/" + input().replace(" ", ""))
.read().decode('utf-8')
base_manager(getname())
except HTTPError:
print("Invalid URL.")
main()
def getname(): # Get's the swimmer's name
print("Insert your name")
return input()
def base_manager(string): # Gets the swimmer's age, his time, the event starting index and the event ending index
global event_start, age, event_end, own_time
try:
namei = html.lower().index(string.lower())
nos = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10"]
if html[namei - 3] not in nos:
raise ValueError
age = html[namei + 30:namei + 32]
own_time = html[namei + 55: namei + 66].replace(" ", "00:").replace(" ", "0")
event_start = html.rfind("=", 0, namei)
event_end = html.find("Event", namei)
if own_time == "00:00:00:NT":
print("You have no time.")
own_time = "99:99:99:99"
else:
print("Your own time is " + own_time)
times_grouper()
except ValueError:
print("Name not found.")
base_manager(getname())
def main(): # The main function
global html, event_start, event_end, age, own_time, final_times_dict
html = ""
event_start = 0
event_end = 0
age = ""
own_time = ""
final_times_dict =
geturl()
if __name__ == "__main__":
main()
When a swimmer's time is "NT", this means that the swimmer has no time, thus we consider all of the competitors better than him. The fastest swimmers are in the later heats while the slower ones are in the first heats.
A swimmer's competitors are the ones with the same age and better (smaller) times.
We check the swimmer's event by reverse finding the first "=" symbol since every event header ends with it and it is a distinct character.
We validate the swimmer's name by checking the lane number which is two characters before his name.
We check the competitors heats by reverse finding the word "heat" (the heat number would be two characters after it) which is also a distinct string.
We print the swimmer's own time, his position and the competitors' heats and times.
How can I make this code better/more optimized/smaller/more readable?
python python-3.x strings formatting web-scraping
This is a code that allows a swimmer to know his position with respect to other competitors. It gets the swimmer's time from the meet's website, finds other competitors' times and shows the better ones along with the competitors' names and heats. I do not have access to the website's database, so I was forced to use string indexing.
A meet page looks like this: http://csz-eg.org/images/stories/Contents/2018/Swimming/CSSC2018/Heats/HTML/Day_11-2.htm
We have the first part of the URL entered (http://csz-eg.org/images/stories/Contents/2018/Swimming/) then we concatenate the rest.
from urllib.request import *
from urllib.error import *
# Global variables that are used in multiple functions
html = ""
event_start = 0
event_end = 0
age = ""
own_time = ""
final_times_dict =
def nameheat_getter(time): # (time is hh:mm:ss:tt) gets the name and heat of a certain time's owner
time_index = html.find(time[6:])
name_index = time_index - 61
name = html[name_index: name_index + 20]
heat_index = html.rfind("Heat", 0, name_index) + 5
heat = html[heat_index: heat_index + 2]
return [name, heat]
def final_print(finaltimes): # Prints the seed position of the swimmer and better swimmers' times
print("===============nYour seed position is " + str(len(finaltimes) + 1) + "n===============")
print("nCompetitors with better times: n")
for item in finaltimes:
item = final_times_dict[item]
nameheat = nameheat_getter(item)
print("Name: " + nameheat[0])
print("Heat: " + nameheat[1])
print("Time: " + item)
print()
def converter(item): # Converts the time string to hours, minutes, seconds and milliseconds to be able to compare them
hours = int(item[0:2])
minutes = int(item[3:5])
seconds = int(item[6:8])
ms = int(item[9:])
return float(hours*60*60 + minutes*60 + seconds + ms/100)
def comparer(timesx): # Compares the swimmer's time with the competitor's times, appends the better times to a list.
own_time_converted = converter(own_time)
for item in timesx:
if converter(item) < own_time_converted:
final_times_dict[converter(item)] = item
final_times_lst = sorted(final_times_dict)
final_print(final_times_lst)
def times_grouper(): # puts the competitors' times in a list
itern = event_start
times =
while True:
age_index = html.find(age, itern, event_end)
if age_index == -1:
break
if html[age_index + 33] != ".":
itern = age_index + 1
continue
time = html[age_index + 25:age_index + 36]
times.append(time.replace(" ", "00:").replace(" ", "0"))
itern = age_index+1
comparer(times)
def geturl(): # Gets the race url and concatenates it with the site's URL
global html
try:
print("Complete the race URL (HTML)nhttp://csz-eg.org/images/stories/Contents/2018/Swimming/...")
html = urlopen("http://csz-eg.org/images/stories/Contents/2018/Swimming/" + input().replace(" ", ""))
.read().decode('utf-8')
base_manager(getname())
except HTTPError:
print("Invalid URL.")
main()
def getname(): # Get's the swimmer's name
print("Insert your name")
return input()
def base_manager(string): # Gets the swimmer's age, his time, the event starting index and the event ending index
global event_start, age, event_end, own_time
try:
namei = html.lower().index(string.lower())
nos = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10"]
if html[namei - 3] not in nos:
raise ValueError
age = html[namei + 30:namei + 32]
own_time = html[namei + 55: namei + 66].replace(" ", "00:").replace(" ", "0")
event_start = html.rfind("=", 0, namei)
event_end = html.find("Event", namei)
if own_time == "00:00:00:NT":
print("You have no time.")
own_time = "99:99:99:99"
else:
print("Your own time is " + own_time)
times_grouper()
except ValueError:
print("Name not found.")
base_manager(getname())
def main(): # The main function
global html, event_start, event_end, age, own_time, final_times_dict
html = ""
event_start = 0
event_end = 0
age = ""
own_time = ""
final_times_dict =
geturl()
if __name__ == "__main__":
main()
When a swimmer's time is "NT", this means that the swimmer has no time, thus we consider all of the competitors better than him. The fastest swimmers are in the later heats while the slower ones are in the first heats.
A swimmer's competitors are the ones with the same age and better (smaller) times.
We check the swimmer's event by reverse finding the first "=" symbol since every event header ends with it and it is a distinct character.
We validate the swimmer's name by checking the lane number which is two characters before his name.
We check the competitors heats by reverse finding the word "heat" (the heat number would be two characters after it) which is also a distinct string.
We print the swimmer's own time, his position and the competitors' heats and times.
How can I make this code better/more optimized/smaller/more readable?
python python-3.x strings formatting web-scraping
asked Jul 13 at 18:18
Omar.B
754
754
add a comment |Â
add a comment |Â
1 Answer
1
active
oldest
votes
up vote
3
down vote
accepted
This is good code for a first stab. Let's dive in and see what can be improved.
I have one overarching recommendation. Scraping is a hard task because it incurs tech debt (in the form of if the source page format changes at all in a way that breaks one of your assumptions, you may have to completely rewrite parts of your code). It looks like this document is auto-generated, so its format is probably fairly stable. But I've made that assumption many times before and it's come back to bite me later.
I'd recommend separating concerns in your script. Right now you interweave all of your concerns (getting user input, downloading the page, parsing the page, extracting the desired information, and displaying output). Each of these concerns has its own error conditions and reasons to change, and either of those happening now likely requires a large rewrite on your part.
Instead, look to separate these concerns into "modules" (not necessarily Python modules, but perhaps functions or classes) with clear API boundaries. In this way, you can test them individually (integration/unit testing is especially important for the scraping component) and if any need to change, it will be an isolated change instead of a massive overhaul.
As a teaser, wouldn't it be nice if you main looked like this (forgive me if I miss some details from your script, another consequence about not separate concerns is that it's harder to grok code):
import sys


def main():
    """Print the standings for the requested swimmer, one line per entry."""
    try:
        standings = get_swimmer_standing(args.swimmer_name, args.race_url)
        for i, swimmer in enumerate(standings):
            # The f-string braces were lost in transcription; restored here.
            print(f'{i + 1}. {swimmer.name} {swimmer.time}')
    except SwimmingDataError as e:
        print(e, file=sys.stderr)
        sys.exit(1)
Digging further let's see what a nice get_swimmer_standing
would look like:
from operator import attrgetter


def get_swimmer_standing(swimmer_name, race_url):
    """Returns the standings of the event in which swimmer_name participated.

    Retrieves race information from race_url, locates the event in which
    swimmer_name participated and returns a list of faster swimmers sorted
    by time (includes the requested swimmer).
    """
    race_page = RacePage.parse(get_race_page(race_url))
    event, swimmer = find_swimmer_event(race_page, swimmer_name)
    # sorted() already returns a list; wrapping it in list() was redundant.
    standings = sorted(event.swimmers, key=attrgetter('time'))
    # Lists have no .find() method — .index() is the lookup intended here.
    return standings[:standings.index(swimmer) + 1]
We note that get_race_page
can do the urllib
stuff you did previously and should return the contents of the page or raise a subclass of SwimmerDataError
.
Let's then dig into find_swimmer_event
:
def find_swimmer_event(race_page, swimmer_name):
    """Returns the Swimmer with swimmer_name and Event they participated in on race_page."""
    matches = ((event, swimmer)
               for event in race_page.events
               for swimmer in event.swimmers
               if swimmer.name == swimmer_name)
    found = next(matches, None)
    if found is None:
        raise SwimmerNotFoundError(swimmer_name)
    return found
What's important to note here is that, so far, we've been dealing at the right levels of abstraction and we've clearly handled our error cases. "Modules" (functions up to this point) do one task and do it well (see Single Responsibility Principle).
Also note that I've been dreaming up this API as I go. At the point of writing this sentence, I haven't yet considered any of the details of the lower levels (ex. parsing of the page). Instead, I am deciding the best API for accessing the data the higher levels need and using that to inform my decisions of how to design lower levels.
Now that we'll start talking about scraping, let's first formulate the data classes we'll need. These are preferable to global variables like event_start
, event_end
, and own_time
because they keep related data together and give a uniform API for querying it.
Note that I'm using dataclasses here (which are Python 3.7), but you could use namedtuples on 3.6 (it's just the syntax for giving them methods is a little messy).
from dataclasses import dataclass  # was missing; @dataclass needs it
from typing import List


@dataclass(frozen=True)
class RacePage:
    # Quoted forward reference: Event is defined below this class.
    events: 'List[Event]'


@dataclass(frozen=True)
class Event:
    num: int
    name: str
    heats: 'List[Heat]'

    @property
    def swimmers(self):  # the transcription dropped the "self" parameter
        """Yield every swimmer in every heat of this event, in heat order."""
        for heat in self.heats:
            yield from heat.swimmers


@dataclass(frozen=True)
class Heat:
    num: int
    swimmers: 'List[Swimmer]'


@dataclass(frozen=True)
class Swimmer:
    lane: int
    name: str
    time: 'RaceTime'  # RaceTime is defined later in the answer
The above code alone would just work with the functions we defined above. The data classes can (and usually should be) for the most part dumb containers. Their power is in defining a nice API that will allow us to make other queries later should we need to (ex. "who is the youngest swimmer?").
Now for the actual parsing, we're again going to try to limit responsibility. RacePage.parse
should only be concerned with trimming off the header and separating the page into several strings beginning with "Event "
. It then delegates the parsing of each event to Event.parse
. That, in turn, is responsible for extracting the event's name, trimming the header, and splitting the remaining texts into strings beginning with "Heat "
. Then it defers to Heat.parse
. That should extract the heat number, trim the header, and split the lines, delegating to Swimmer.parse
. Do you see the pattern here?
Let's see just how simple these parsers can be if we continue to separate concerns and work at the right level of abstraction:
import re

from dataclasses import dataclass  # was missing; @dataclass needs it


@dataclass(frozen=True)
class RacePage:
    events: 'List[Event]'

    @classmethod
    def parse(cls, page_content):
        """Split the page into event chunks and delegate to Event.parse."""
        # First part is the header before the first event
        # (regex backslashes were stripped in transcription; restored).
        events = split_on(r'^Event \d+ ', page_content)[1:]
        return cls(list(map(Event.parse, events)))


@dataclass(frozen=True)
class Event:
    num: int
    name: str
    heats: 'List[Heat]'

    @classmethod
    def parse(cls, event_content):
        """Parse one event chunk: header line, then one chunk per heat."""
        # Parse first line for event number and name, e.g.:
        # Event 123 Olympic Freestyle Trials
        m = re.match(r'Event +(?P<num>\d+) +(?P<name>[^\n]+)', event_content)
        num = int(m.group('num'))
        name = m.group('name').strip()
        # The first part is the event header before the first heat
        heats = split_on(r'Heat \d+ ', event_content)[1:]
        return cls(num, name, list(map(Heat.parse, heats)))

    # ...


@dataclass(frozen=True)
class Heat:
    num: int
    swimmers: 'List[Swimmer]'

    @classmethod
    def parse(cls, heat_content):
        """Parse one heat chunk: header line, then one line per swimmer."""
        # Parse heat number from the header, e.g.:
        # Heat 1 of 10 Timed Finals Starts at 05:29 PM
        m = re.match(r'Heat (?P<num>\d+) ', heat_content)
        num = int(m.group('num'))
        # The first line is the heat header before the first swimmer
        swimmers = heat_content.split('\n')[1:]
        return cls(num, list(map(Swimmer.parse, swimmers)))


@dataclass(frozen=True)
class Swimmer:
    lane: int
    name: str
    time: 'RaceTime'

    @classmethod
    def parse(cls, swimmer_line):
        """Parse lane number, name, and time from one heat line, e.g.:

            5 Michael Phelps 28 USA 50.58

        Note: assumes names don't have numbers in them.
        """
        m = re.match(r' *(?P<lane>\d+) +(?P<name>\D+) +(?P<rest>.+)', swimmer_line)
        lane = int(m.group('lane'))
        name = m.group('name')
        rest = m.group('rest')
        # Only the last whitespace-separated field is the time, so the middle
        # of the line may change format without breaking this parser.
        time = RaceTime.parse(rest.split()[-1])
        return cls(lane, name, time)
Note that these parse methods are very simple. I've used regexes to keep things clean. Also note that I collect more information than you need (which you can choose to omit). A particular strategy that's important for longevity is to be extremely forgiving about formatting. So for example, since all we need for Swimmer
is the first number and name and the last time, we are careful to not make any assumptions about what is in between. In particular, once we pull off the lane number and name, with the rest we do rest.split()[-1]
to get the last segment without having to care about anything before it. This allows the middle to change formats entirely as long as the lane number, name, and time remain in their expected positions.
Notice the use of split_on
. This is a helper that I've yet to write, but identified as a common task done in a few places. It's better to pull this out into a small helper function (that can be tested on its own). While I was writing the parsers, I didn't concern myself with how this function would work. Now that I am, I recognize that this is nearly a function provided by python already:
import re


def split_on(regex, s):
    """Split a string every time regex is matched (at the beginning of each line)."""
    pattern = re.compile(regex, flags=re.MULTILINE)
    return pattern.split(s)
Now all that's left is RaceTime
and RaceTime.parse
. Ideally, we want a construction that's automatically sortable (dataclasses
and namedtuples
are). I'd recommend using a Decimal
(not float, they can't represent some numbers accurately) for number of seconds. This will be automatically sortable, easy to display, and easy to perform math with.
from dataclasses import dataclass  # was missing; @dataclass needs it
from decimal import Decimal


@dataclass(frozen=True)
class RaceTime:
    # Elapsed time in seconds; Decimal keeps hundredths exact where binary
    # floats cannot.
    elapsed_seconds: Decimal

    def __str__(self):
        seconds = self.elapsed_seconds % 60
        minutes = self.elapsed_seconds // 60
        # The f-string braces were lost in transcription; restored here.
        return f'{minutes}:{seconds}' if minutes > 0 else f'{seconds}'
Parsing will give you a nice idea of why all this separation is good. Now that we have parsing just times out into a single location, we can test it alone. And that means it's much easier to make sure we handle things like NT
and the weird X
prefix in front of some times.
from dataclasses import dataclass  # was missing; @dataclass needs it
from decimal import Decimal


@dataclass(frozen=True)
class RaceTime:
    elapsed_seconds: Decimal

    @classmethod
    def parse(cls, time_str):
        """Parse "mm:ss.hh", "ss.hh", or the no-time marker "NT"."""
        if time_str == 'NT':
            return NO_TIME
        try:
            mins, secs = time_str.split(':', 1)
        except ValueError:
            # No ":" present: the whole string is the seconds part.
            secs = time_str
            mins = 0
        return RaceTime(int(mins) * 60 + Decimal(secs))

    # ...


class NoTime(RaceTime):
    """Sentinel for a swimmer with no recorded time; never sorts earlier."""
    def __lt__(self, other):
        return False


# NoTime() with no argument would raise TypeError: the frozen dataclass
# __init__ requires elapsed_seconds.  Infinity also sorts the sentinel last.
NO_TIME = NoTime(Decimal('Infinity'))
Admittedly, the NO_TIME
is about the only part of this that I don't feel the greatest about, but it's better than just using None
(because that can't be sorted).
Now you can write some good unit tests for these small bits:
from decimal import Decimal
from unittest import TestCase

# from x import RaceTime, NO_TIME


class TestRaceTime(TestCase):
    def test_parse(self):
        """parse() handles a minutes:seconds time and the NT sentinel."""
        # assertEqual takes two arguments; the original passed it a single
        # boolean comparison, and the attribute is elapsed_seconds.
        self.assertEqual(Decimal('105.45'), RaceTime.parse('1:45.45').elapsed_seconds)
        self.assertEqual(NO_TIME, RaceTime.parse('NT'))
That should about do it. Note how this code may not be shorter in terms of lines, but it is definitely shorter horizontally. Each "module" handles only one specific task and has defined API boundaries. Another advantage of these is that if later something were to fail in the script you'd get a fantastic traceback, something like "ValueError on line x in swimdata/heap.py" and the line would indicate the error occurred in Heat.parse()
so you'd know exactly where to look on the race page to see where the format changed.
In theory, everything above should "just work," but I haven't tested it. It's fairly simple though, which was another goal I had.
To motivate how you can use this to improve your code, here are the kinds of things I saw while reading your code that should be reflected in the example above:
- Use better names. What is
base_manager
? That doesn't tell me what it does.namei
?name_position
orname_index
may be better. These names are easier to understand for others reading your code (and maybe you later) - Avoid
global
. It is very hard to read code that uses (or worse mutates) global state, because then you have to scroll all over the place to see where data came from and what format it is in - As an aside to above, a lot of the places where you use globals you could've just passed as parameters (ex.
times_group()
called frombase_manager()
) - Separate IO from data processing (what if later you wanted to provide this script as a library for others to use? surely they wouldn't want it to spuriously print things)
- For input you can use a prompt string:
input('Your name: ')
- For errors, perhaps consider printing the error to stderr and returning an error code so your script is a bit easier to interface with from the CLI.
Check out PEP8. Your formatting isn't too bad, but you have some long lines. PEP8 will force good formatting habits that hopefully encourage writing clearer code
- Usually we use single quotes for everything that isn't 3.6 format strings (ex.
" bar".format(foo)
) - You may want to use requests instead of urllib. In particular, it may not be safe to assume the content is UTF-8. Instead let the library decode it for you using the
Content-Type
. - Prefer
for
loops towhile
loops with index variables (ex.times_grouper()
) - You are doing a lot of work that you can leave to python (number parsing for example in
base_manager
)
Regexes may help make some of your parsing code a little cleaner- You make a lot of fragile assumptions about the data that you care about (number of digits, exact spacing). This is more likely to break than general assumptions about page format.
- There are lots of magic numbers in your code. Try to avoid them or use constants where they are needed to document their purpose. Especially for scraping, it is important to not rely on numbers like this because they are likely to change.
- Document what functions do (input, output, side effects) in
"""Doc comments."""
Write it from the perspective of someone wanting to use your function. What would they want to know. - Use
# inline comments
to explain certain lines that may not be immediately obvious (ex. see above how I explain why I slice off the first parts in the parsers)
That's about all I have time for, but hopefully that's helpful. If you'd like more pointers, I can make a gist with all of this nicely separated into modules and with examples of how to test it.
add a comment |Â
1 Answer
1
active
oldest
votes
1 Answer
1
active
oldest
votes
active
oldest
votes
active
oldest
votes
up vote
3
down vote
accepted
This is good code for a first stab. Let's dive in and see what can be improved.
I have one overarching recommendation. Scraping is a hard task because it incurs tech debt (in the form of if the source page format changes at all in a way that breaks one of your assumptions, you may have to completely rewrite parts of your code). It looks like this document is auto-generated, so its format is probably fairly stable. But I've made that assumption many times before and it's come back to bite me later.
I'd recommend separating concerns in your script. Right now you interweave all of your concerns (getting user input, downloading the page, parsing the page, extracting the desired information, and displaying output). Each of these concerns has its own error conditions and reasons to change, and either of those happening now likely requires a large rewrite on your part.
Instead, look to separate these concerns into "modules" (not necessarily Python modules, but perhaps functions or classes) with clear API boundaries. In this way, you can test them individually (integration/unit testing is especially important for the scraping component) and if any need to change, it will be an isolated change instead of a massive overhaul.
As a teaser, wouldn't it be nice if you main looked like this (forgive me if I miss some details from your script, another consequence about not separate concerns is that it's harder to grok code):
import sys
def main():
try:
standings = get_swimmer_standing(args.swimmer_name, args.race_url)
for i, swimmer in enumerate(standings):
print(f'{i+1}. {swimmer.name} {swimmer.time}')
except SwimmingDataError as e:
print(e, file=sys.stderr)
sys.exit(1)
Digging further let's see what a nice get_swimmer_standing
would look like:
from operator import attrgetter
def get_swimmer_standing(swimmer_name, race_url):
"""Returns the standings of the event in which swimmer_name participated.
Retrieves race information from race_url, locates the event in which
swimmer_name participated and returns a list of faster swimmers sorted
by time (includes the requested swimmer).
"""
race_page = RacePage.parse(get_race_page(race_url))
event, swimmer = find_swimmer_event(race_page, swimmer_name)
standings = list(sorted(event.swimmers, key=attrgetter('time')))
return standings[:standings.index(swimmer)+1]
We note that get_race_page
can do the urllib
stuff you did previously and should return the contents of the page or raise a subclass of SwimmerDataError
.
Let's then dig into find_swimmer_event
:
def find_swimmer_event(race_page, swimmer_name):
"""Returns the Swimmer with swimmer_name and Event they participated in on race_page."""
for event in race_page.events:
for swimmer in event.swimmers:
if swimmer.name == swimmer_name:
return event, swimmer
raise SwimmerNotFoundError(swimmer_name)
What's important to note here is that, so far, we've been dealing at the right levels of abstraction and we've clearly handled our error cases. "Modules" (functions up to this point) do one task and do it well (see Single Responsibility Principle).
Also note that I've been dreaming up this API as I go. At the point of writing this sentence, I haven't yet considered any of the details of the lower levels (ex. parsing of the page). Instead, I am deciding the best API for accessing the data the higher levels need and using that to inform my decisions of how to design lower levels.
Now that we'll start talking about scraping, let's first formulate the data classes we'll need. These are preferable to global variables like event_start
, event_end
, and own_time
because they keep related data together and give a uniform API for querying it.
Note that I'm using dataclasses here (which are Python 3.7), but you could use namedtuples on 3.6 (it's just the syntax for giving them methods is a little messy).
from typing import List
@dataclass(frozen=True)
class RacePage:
events: List[Event]
@dataclass(frozen=True)
class Event:
num: int
name: str
heats: List[Heat]
@property
def swimmers():
for heat in self.heats:
yield from heat.swimmers
@dataclass(frozen=True)
class Heat:
num: int
swimmers: List[Swimmer]
@dataclass(frozen=True)
class Swimmer:
lane: int
name: str
time: RaceTime
The above code alone would just work with the functions we defined above. The data classes can (and usually should be) for the most part dumb containers. Their power is in defining a nice API that will allow us to make other queries later should we need to (ex. "who is the youngest swimmer?").
Now for the actual parsing, we're again going to try to limit responsibility. RacePage.parse
should only be concerned with trimming off the header and separating the page into several strings beginning with "Event "
. It then delegates the parsing of each event to Event.parse
. That, in turn, is responsible for extracting the event's name, trimming the header, and splitting the remaining texts into strings beginning with "Heat "
. Then it defers to Heat.parse
. That should extract the heat number, trim the header, and split the lines, delegating to Swimmer.parse
. Do you see the pattern here?
Let's see just how simple these parsers can be if we continue to separate concerns and work at the right level of abstraction:
import re
@dataclass(frozen=True)
class RacePage:
events: List[Event]
@classmethod
def parse(cls, page_content):
# First part is the header before the first event
events = split_on(r'^Event \d+ ', page_content)[1:]
return cls(list(map(Event.parse, events)))
@dataclass(frozen=True)
class Event:
num: int
name: str
heats: List[Heat]
@classmethod
def parse(cls, event_content):
# Parse first line for event number and name
# Event 123 Olympic Freestyle Trials
m = re.match(r'Event +(?P<num>\d+) +(?P<name>[^\n]+)', event_content)
num = int(m.group('num'))
name = m.group('name').strip()
# The first part is the event header before the first heat
heats = split_on(r'Heat \d+ ', event_content)[1:]
return cls(num, name, list(map(Heat.parse, heats)))
# ...
@dataclass(frozen=True)
class Heat:
num: int
swimmers: List[Swimmer]
@classmethod
def parse(cls, heat_content):
# Parse heat number from the header
# Heat 1 of 10 Timed Finals Starts at 05:29 PM
m = re.match(r'Heat (?P<num>\d+) ', heat_content)
num = int(m.group('num'))
# The first line is the heat header before the first swimmer
swimmers = heat_content.split('\n')[1:]
return cls(num, list(map(Swimmer.parse, swimmers)))
@dataclass(frozen=True)
class Swimmer:
lane: int
name: str
time: RaceTime
@classmethod
def parse(cls, swimmer_line):
# Parses lane number, name, and time from heat line
# 5 Michael Phelps 28 USA 50.58
# Note: assumes names don't have numbers in them
m = re.match(r' *(?P<lane>\d+) +(?P<name>\D+) +(?P<rest>.+)', swimmer_line)
lane = int(m.group('lane'))
name = m.group('name')
rest = m.group('rest')
time = RaceTime.parse(rest.split()[-1])
return cls(lane, name, time)
Note that these parse methods are very simple. I've used regexes to keep things clean. Also note that I collect more information than you need (which you can choose to omit). A particular strategy that's important for longevity is to be extremely forgiving about formatting. So for example, since all we need for Swimmer
is the first number and name and the last time, we are careful to not make any assumptions about what is in between. In particular, once we pull off the lane number and name, with the rest we do rest.split()[-1]
to get the last segment without having to care about anything before it. This allows the middle to change formats entirely as long as the lane number, name, and time remain in their expected positions.
Notice the use of split_on
. This is a helper that I've yet to write, but identified as a common task done in a few places. It's better to pull this out into a small helper function (that can be tested on its own). While I was writing the parsers, I didn't concern myself with how this function would work. Now that I am, I recognize that this is nearly a function provided by python already:
import re
def split_on(regex, s):
"""Split a string every time regex is matched (at the beginning of each line)."""
return re.split(regex, s, flags=re.MULTILINE)
Now all that's left is RaceTime
and RaceTime.parse
. Ideally, we want a construction that's automatically sortable (dataclasses
and namedtuples
are). I'd recommend using a Decimal
(not float, they can't represent some numbers accurately) for number of seconds. This will be automatically sortable, easy to display, and easy to perform math with.
from decimal import Decimal
@dataclass(frozen=True)
class RaceTime:
elapsed_seconds: Decimal
def __str__(self):
seconds = self.elapsed_seconds % 60
minutes = self.elapsed_seconds // 60
return f'{minutes}:{seconds}' if minutes > 0 else f'{seconds}'
Parsing will give you a nice idea of why all this separation is good. Now that we have parsing just times out into a single location, we can test it alone. And that means it's much easier to make sure we handle things like NT
and the weird X
prefix in front of some times.
from decimal import Decimal
@dataclass(frozen=True)
class RaceTime:
elapsed_seconds: Decimal
@classmethod
def parse(cls, time_str):
if time_str == 'NT':
return NO_TIME
try:
mins, secs = time_str.split(':', 1)
except ValueError:
secs = time_str
mins = 0
return RaceTime(int(mins) * 60 + Decimal(secs))
# ...
class NoTime(RaceTime):
def __lt__(self, other):
return False
NO_TIME = NoTime()
Admittedly, the NO_TIME
is about the only of this that I don't feel the greatest about, but it's better than just using None
(because that can't be sorted).
Now you can write some good unit tests for these small bits:
from unittest import TestCase
# from x import RaceTime, NO_TIME
class TestRaceTime(TestCase):
def test_parse(self):
self.assertEqual(Decimal('105.45'), RaceTime.parse('1:45.45').elapsed_seconds)
self.assertEqual(NO_TIME, RaceTime.parse('NT'))
That should about do it. Note how this code may not be shorter in terms of lines, but it is definitely shorter horizontally. Each "module" handles only one specific task and has defined API boundaries. Another advantage of these is that if later something were to fail in the script you'd get a fantastic traceback, something like "ValueError on line x in swimdata/heap.py" and the line would indicate the error occurred in Heat.parse()
so you'd know exactly where to look on the race page to see where the format changed.
In theory, everything above should "just work," but I haven't tested it. It's fairly simple though, which was another goal I had.
To motivate how you can use this to improve your code, here are the kinds of things I saw while reading your code that should be reflected in the example above:
- Use better names. What is
base_manager
? That doesn't tell me what it does.namei
?name_position
orname_index
may be better. These names are easier to understand for others reading your code (and maybe you later) - Avoid
global
. It is very hard to read code that uses (or worse mutates) global state, because then you have to scroll all over the place to see where data came from and what format it is in - As an aside to above, a lot of the places where you use globals you could've just passed as parameters (ex.
times_group()
called frombase_manager()
) - Separate IO from data processing (what if later you wanted to provide this script as a library for others to use? surely they wouldn't want it to spuriously print things)
- For input you can use a prompt string:
input('Your name: ')
- For errors, perhaps consider printing the error to stderr and returning an error code so your script is a bit easier to interface with from the CLI.
- Check out PEP8. Your formatting isn't too bad, but you have some long lines. PEP8 will force good formatting habits that hopefully encourage writing clearer code
- Usually we use single quotes for everything that isn't 3.6 format strings (ex.
" bar".format(foo)
) - You may want to use requests instead of urllib. In particular, it may not be safe to assume the content is UTF-8. Instead let the library decode it for you using the
Content-Type
. - Prefer
for
loops towhile
loops with index variables (ex.times_grouper()
) - You are doing a lot of work that you can leave to python (number parsing for example in
base_manager
)
Regexes may help make some of your parsing code a little cleaner- You make a lot of fragile assumptions about the data that you care about (number of digits, exact spacing). This is more likely to break than general assumptions about page format.
- There are lots of magic numbers in your code. Try to avoid them or use constants where they are needed to document their purpose. Especially for scraping, it is important to not rely on numbers like this because they are likely to change.
- Document what functions do (input, output, side effects) in
"""Doc comments."""
Write it from the perspective of someone wanting to use your function. What would they want to know. - Use
# inline comments
to explain certain lines that may not be immediately obvious (ex. see above how I explain why I slice off the first parts in the parsers)
That's about all I have time for, but hopefully that's helpful. If you'd like more pointers, I can make a gist with all of this nicely separated into modules and with examples of how to test it.
add a comment |
up vote
3
down vote
accepted
This is good code for a first stab. Let's dive in and see what can be improved.
I have one overarching recommendation. Scraping is a hard task because it incurs tech debt (in the form of if the source page format changes at all in a way that breaks one of your assumptions, you may have to completely rewrite parts of your code). It looks like this document is auto-generated, so its format is probably fairly stable. But I've made that assumption many times before and it's come back to bite me later.
I'd recommend separating concerns in your script. Right now you interweave all of your concerns (getting user input, downloading the page, parsing the page, extracting the desired information, and displaying output). Each of these concerns has its own error conditions and reasons to change, and either of those happening now likely requires a large rewrite on your part.
Instead, look to separate these concerns into "modules" (not necessarily Python modules, but perhaps functions or classes) with clear API boundaries. In this way, you can test them individually (integration/unit testing is especially important for the scraping component) and if any need to change, it will be an isolated change instead of a massive overhaul.
As a teaser, wouldn't it be nice if you main looked like this (forgive me if I miss some details from your script, another consequence about not separate concerns is that it's harder to grok code):
import sys
def main():
try:
standings = get_swimmer_standing(args.swimmer_name, args.race_url)
for i, swimmer in enumerate(standings):
print(f'i+1. swimmer.name swimmer.time')
except SwimmingDataError as e:
print(e, file=sys.stderr)
sys.exit(1)
Digging further let's see what a nice get_swimmer_standing
would look like:
from operator import attrgetter
def get_swimmer_standing(swimmer_name, race_url):
"""Returns the standings of the event in which swimmer_name participated.
Retrieves race information from race_url, locates the event in which
swimmer_name participated and returns a list of faster swimmers sorted
by time (includes the requested swimmer).
"""
race_page = RacePage.parse(get_race_page(race_url))
event, swimmer = find_swimmer_event(race_page, swimmer_name)
standings = list(sorted(event.swimmers, key=attrgetter('time')))
return standings[:standings.find(swimmer)+1]
We note that get_race_page
can do the urllib
stuff you did previously and should return the contents of the page or raise a subclass of SwimmerDataError
.
Let's then dig into find_swimmer_event
:
def find_swimmer_event(race_page, swimmer_name):
"""Returns the Swimmer with swimmer_name and Event they participated in on race_page."""
for event in race_page.events:
for swimmer in event.swimmers:
if swimmer.name == swimmer_name:
return event, swimmer
raise SwimmerNotFoundError(swimmer_name)
What's important to note here is that, so far, we've been dealing at the right levels of abstraction and we've clearly handled our error cases. "Modules" (functions up to this point) do one task and do it well (see Single Responsibility Principle).
Also note that I've been dreaming up this API as I go. At the point of writing this sentence, I haven't yet considered any of the details of the lower levels (ex. parsing of the page). Instead, I am deciding the best API for accessing the data the higher levels need and using that to inform my decisions of how to design lower levels.
Now that we'll start talking about scraping, let's first formulate the data classes we'll need. These are preferable to global variables like event_start
, event_end
, and own_time
because they keep related data together and give a uniform API for querying it.
Note that I'm using dataclasses here (which are Python 3.7), but you could use namedtuples on 3.6 (it's just the syntax for giving them methods is a little messy).
from typing import List
@dataclass(frozen=True)
class RacePage:
events: List[Event]
@dataclass(frozen=True)
class Event:
num: int
name: str
heats: List[Heat]
@property
def swimmers():
for heat in self.heats:
yield from heat.swimmers
@dataclass(frozen=True)
class Heat:
num: int
swimmers: List[Swimmer]
@dataclass(frozen=True)
class Swimmer:
lane: int
name: str
time: RaceTime
The above code alone would just work with the functions we defined above. The data classes can (and usually should be) for the most part dumb containers. Their power is in defining a nice API that will allow us to make other queries later should we need to (ex. "who is the youngest swimmer?").
Now for the actual parsing, we're again going to try to limit responsibility. RacePage.parse
should only be concerned with trimming off the header and separating the page into several strings beginning with "Event "
. It then delegates the parsing of each event to Event.parse
. That, in turn, is responsible for extracting the event's name, trimming the header, and splitting the remaining texts into strings beginning with "Heat "
. Then it defers to Heat.parse
. That should extract the heat number, trim the header, and split the lines, delegating to Swimmer.parse
. Do you see the pattern here?
Let's see just how simple these parsers can be if we continue to separate concerns and work at the right level of abstraction:
import re
@dataclass(frozen=True)
class RacePage:
events: List[Event]
@classmethod
def parse(cls, page_content):
# First part is the header before the first event
events = split_on(r'^Event d+ ', page_content)[1:]
return cls(list(map(Event.parse, events)))
@dataclass(frozen=True)
class Event:
num: int
name: str
heats: List[Heat]
@classmethod
def parse(cls, event_content):
# Parse first line for event number and name
# Event 123 Olympic Freestyle Trials
m = re.match(r'Event +(?P<num>d+) +(?<name>[^n]+)', event_content)
num = int(m.group('num'))
name = m.group('name').strip()
# The first part is the event header before the first heat
heats = split_on(r'Heat d+ ', event_content)[1:]
return cls(num, name, list(map(Heat.parse, heats)))
# ...
@dataclass(frozen=True)
class Heat:
num: int
swimmers: List[Swimmer]
@classmethod
def parse(cls, heat_content):
# Parse heat number from the header
# Heat 1 of 10 Timed Finals Starts at 05:29 PM
m = re.match(r'Heat (?<num>d+) ', heat_content)
num = int(m.group('num'))
# The first line is the heat header before the first swimmer
swimmers = heat_content.split('n')[1:]
return cls(num, list(map(Swimmer.parse, swimmers)))
@dataclass(frozen=True)
class Swimmer:
lane: int
name: str
time: RaceTime
@classmethod
def parse(cls, swimmer_line):
# Parses lane number, name, and time from heat line
# 5 Michael Phelps 28 USA 50.58
# Note: assumes names don't have numbers in them
m = re.match(r' *(?<lane>d+) +(?P<name>D+) +(?<rest>.+)', swimmer_line)
lane = int(m.group('lane'))
name = m.group('name')
rest = m.group('rest')
time = RaceTime.parse(rest.split()[-1])
return cls(lane, name, time)
Note that these parse methods are very simple. I've used regexes to keep things clean. Also note that I collect more information than you need (which you can choose to omit). A particular strategy that's important for longevity is to be extremely forgiving about formatting. So for example, since all we need for Simmer
is the first number and name and the last time, we are careful to not make any assumptions about what is in between. In particular, once we pull off the lane number and name, with the rest we do rest.split()[-1]
to get the last segment without having to care about anything before it. This allows the middle to change formats entirely as long as the lane number, name, and time remain in their expected positions.
Notice the use of split_on
. This is a helper that I've yet to write, but identified as a common task done in a few places. It's better to pull this out into a small helper function (that can be tested on its own). While I was writing the parsers, I didn't concern myself with how this function would work. Now that I am, I recognize that this is nearly a function provided by python already:
import re
def split_on(regex, s):
"""Split a string every time regex is matched (at the beginning of each line)."""
return re.split(regex, s, flags=re.MULTILINE)
Now all that's left is RaceTime
and RaceTime.parse
. Ideally, we want a construction that's automatically sortable (dataclasses
and namedtuples
are). I'd recommend using a Decimal
(not float, they can't represent some numbers accurately) for number of seconds. This will be automatically sortable, easy to display, and easy to perform math with.
from decimal import Decimal
@dataclass(frozen=True)
class RaceTime:
elapsed_seconds: Decimal
def __str__(self):
seconds = self.elapsed_seconds % 60
minutes = self.elapsed_seconds // 60
return f'minutes:seconds' if minutes > 0 else f'seconds'
Parsing will give you a nice idea of why all this separation is good. Now that we have parsing just times out into a single location, we can test it alone. And that means it's much easier to make sure we handle things like NT
and the weird X
prefix in front of some times.
from decimal import Decimal
@dataclass(frozen=True)
class RaceTime:
elapsed_seconds: Decimal
@classmethod
def parse(cls, time_str):
if time_str == 'NT':
return NO_TIME
try:
mins, secs = time_str.split(':', 1)
except ValueError:
secs = time_str
mins = 0
return RaceTime(int(mins) * 60 + Decimal(secs))
# ...
class NoTime(RaceTime):
def __lt__(self, other):
return False
NO_TIME = NoTime()
Admittedly, the NO_TIME
is about the only of this that I don't feel the greatest about, but it's better than just using None
(because by that can't be sorted).
Now you can write some good unit tests for these small bits:
from unittest import TestCase
# from x import RaceTime, NO_TIME
class TestRaceTime(TestCase):
def test_parse(self):
self.assertEqual(Decimal('105.45') == RaceTime.parse('1:45.45').elapsed_time)
self.assertEqual(NO_TIME == RaceTime.parse('NT'))
That should about do it. Note how this code may not be shorter in terms of lines, but it is definitely shorter horizontally. Each "module" handles only one specific task and has defined API boundaries. Another advantage of these is that if later something were to fail in the script you'd get a fantastic traceback, something like "ValueError on line x in swimdata/heap.py" and the line would indicate the error occurred in Heat.parse()
so you'd know exactly where to look on the race page to see where the format changed.
In theory, everything above should "just work," but I haven't tested it. It's fairly simple though, which was another goal I had.
To motivate how you can use this to improve your code, here are the kinds of things I saw while reading your code that should be reflected in the example above:
- Use better names. What is
base_manager
? That doesn't tell me what it does.namei
?name_position
orname_index
may be better. These names are easier to understand for others reading your code (and maybe you later) - Avoid
global
. It is very hard to read code that uses (or worse mutates) global state, because then you have to scroll all over the place to see where data came from and what format it is in - As an aside to above, a lot of the places where you use globals you could've just passed as parameters (ex.
times_group()
called frombase_manager()
) - Separate IO from data processing (what if later you wanted to provide this script as a library for others to use? surely they wouldn't want it to spuriously print things)
- For input you can use a prompt string:
input('Your name: ')
- For errors, perhaps consider printing the error to stderr and returning an error code so your script is a bit easier to interface with from the CLI.
- Check out PEP8. Your formmating isn't too bad, but you have some long lines. PEP8 will force good formatting habits that hopefully encourage writing clearer code
- Usually we use single quotes for everything that isn't 3.6 format strings (ex.
" bar".format(foo)
) - You may want to use requests instead of urllib. In particular, it may not be safe to assume the content is UTF-8. Instead let the library decode it for you using the
Content-Type
. - Prefer
for
loops towhile
loops with index variables (ex.times_grouper()
) - You are doing a lot of work that you can leave to python (number parsing for example in
base_manager
)
Regexes may help make some of your parsing code a little cleaner- You make a lot of fragile assumptions about the data that you care about (number of digits, exact spacing). This is more likely to break than general assumptions about page format.
- There are lots of magic numbers in your code. Try to avoid them or use constants where they are needed to document their purpose. Especially for scraping, it is important to not rely on numbers like this because they are likely to change.
- Document what functions do (input, output, side effects) in
"""Doc comments."""
Write it from the perspective of someone wanting to use your function. What would they want to know. - Use
# inline comments
to explain certain lines that may not be immediately obvious (ex. see above how I explain why I slice off the first parts in the parsers)
That's about all I have time for, but hopefully that's helpful. If you'd like more pointers, I can make a gist with all of this nicely separated into modules and with examples of how to test it.
add a comment |
up vote
3
down vote
accepted
up vote
3
down vote
accepted
This is good code for a first stab. Let's dive in and see what can be improved.
I have one overarching recommendation. Scraping is a hard task because it incurs tech debt (in the form of if the source page format changes at all in a way that breaks one of your assumptions, you may have to completely rewrite parts of your code). It looks like this document is auto-generated, so its format is probably fairly stable. But I've made that assumption many times before and it's come back to bite me later.
I'd recommend separating concerns in your script. Right now you interweave all of your concerns (getting user input, downloading the page, parsing the page, extracting the desired information, and displaying output). Each of these concerns has its own error conditions and reasons to change, and either of those happening now likely requires a large rewrite on your part.
Instead, look to separate these concerns into "modules" (not necessarily Python modules, but perhaps functions or classes) with clear API boundaries. In this way, you can test them individually (integration/unit testing is especially important for the scraping component) and if any need to change, it will be an isolated change instead of a massive overhaul.
As a teaser, wouldn't it be nice if you main looked like this (forgive me if I miss some details from your script, another consequence about not separate concerns is that it's harder to grok code):
import sys
def main():
try:
standings = get_swimmer_standing(args.swimmer_name, args.race_url)
for i, swimmer in enumerate(standings):
print(f'i+1. swimmer.name swimmer.time')
except SwimmingDataError as e:
print(e, file=sys.stderr)
sys.exit(1)
Digging further let's see what a nice get_swimmer_standing
would look like:
from operator import attrgetter
def get_swimmer_standing(swimmer_name, race_url):
"""Returns the standings of the event in which swimmer_name participated.
Retrieves race information from race_url, locates the event in which
swimmer_name participated and returns a list of faster swimmers sorted
by time (includes the requested swimmer).
"""
race_page = RacePage.parse(get_race_page(race_url))
event, swimmer = find_swimmer_event(race_page, swimmer_name)
standings = list(sorted(event.swimmers, key=attrgetter('time')))
return standings[:standings.find(swimmer)+1]
We note that get_race_page
can do the urllib
stuff you did previously and should return the contents of the page or raise a subclass of SwimmerDataError
.
Let's then dig into find_swimmer_event
:
def find_swimmer_event(race_page, swimmer_name):
"""Returns the Swimmer with swimmer_name and Event they participated in on race_page."""
for event in race_page.events:
for swimmer in event.swimmers:
if swimmer.name == swimmer_name:
return event, swimmer
raise SwimmerNotFoundError(swimmer_name)
What's important to note here is that, so far, we've been dealing at the right levels of abstraction and we've clearly handled our error cases. "Modules" (functions up to this point) do one task and do it well (see Single Responsibility Principle).
Also note that I've been dreaming up this API as I go. At the point of writing this sentence, I haven't yet considered any of the details of the lower levels (ex. parsing of the page). Instead, I am deciding the best API for accessing the data the higher levels need and using that to inform my decisions of how to design lower levels.
Now that we'll start talking about scraping, let's first formulate the data classes we'll need. These are preferable to global variables like event_start
, event_end
, and own_time
because they keep related data together and give a uniform API for querying it.
Note that I'm using dataclasses here (which are Python 3.7), but you could use namedtuples on 3.6 (it's just the syntax for giving them methods is a little messy).
from typing import List
@dataclass(frozen=True)
class RacePage:
events: List[Event]
@dataclass(frozen=True)
class Event:
num: int
name: str
heats: List[Heat]
@property
def swimmers():
for heat in self.heats:
yield from heat.swimmers
@dataclass(frozen=True)
class Heat:
num: int
swimmers: List[Swimmer]
@dataclass(frozen=True)
class Swimmer:
lane: int
name: str
time: RaceTime
The above code alone would just work with the functions we defined above. The data classes can (and usually should be) for the most part dumb containers. Their power is in defining a nice API that will allow us to make other queries later should we need to (ex. "who is the youngest swimmer?").
Now for the actual parsing, we're again going to try to limit responsibility. RacePage.parse
should only be concerned with trimming off the header and separating the page into several strings beginning with "Event "
. It then delegates the parsing of each event to Event.parse
. That, in turn, is responsible for extracting the event's name, trimming the header, and splitting the remaining texts into strings beginning with "Heat "
. Then it defers to Heat.parse
. That should extract the heat number, trim the header, and split the lines, delegating to Swimmer.parse
. Do you see the pattern here?
Let's see just how simple these parsers can be if we continue to separate concerns and work at the right level of abstraction:
import re
@dataclass(frozen=True)
class RacePage:
events: List[Event]
@classmethod
def parse(cls, page_content):
# First part is the header before the first event
events = split_on(r'^Event d+ ', page_content)[1:]
return cls(list(map(Event.parse, events)))
@dataclass(frozen=True)
class Event:
num: int
name: str
heats: List[Heat]
@classmethod
def parse(cls, event_content):
# Parse first line for event number and name
# Event 123 Olympic Freestyle Trials
m = re.match(r'Event +(?P<num>d+) +(?<name>[^n]+)', event_content)
num = int(m.group('num'))
name = m.group('name').strip()
# The first part is the event header before the first heat
heats = split_on(r'Heat d+ ', event_content)[1:]
return cls(num, name, list(map(Heat.parse, heats)))
# ...
@dataclass(frozen=True)
class Heat:
num: int
swimmers: List[Swimmer]
@classmethod
def parse(cls, heat_content):
# Parse heat number from the header
# Heat 1 of 10 Timed Finals Starts at 05:29 PM
m = re.match(r'Heat (?<num>d+) ', heat_content)
num = int(m.group('num'))
# The first line is the heat header before the first swimmer
swimmers = heat_content.split('n')[1:]
return cls(num, list(map(Swimmer.parse, swimmers)))
@dataclass(frozen=True)
class Swimmer:
lane: int
name: str
time: RaceTime
@classmethod
def parse(cls, swimmer_line):
# Parses lane number, name, and time from heat line
# 5 Michael Phelps 28 USA 50.58
# Note: assumes names don't have numbers in them
m = re.match(r' *(?<lane>d+) +(?P<name>D+) +(?<rest>.+)', swimmer_line)
lane = int(m.group('lane'))
name = m.group('name')
rest = m.group('rest')
time = RaceTime.parse(rest.split()[-1])
return cls(lane, name, time)
Note that these parse methods are very simple. I've used regexes to keep things clean. Also note that I collect more information than you need (which you can choose to omit). A particular strategy that's important for longevity is to be extremely forgiving about formatting. So for example, since all we need for Simmer
is the first number and name and the last time, we are careful to not make any assumptions about what is in between. In particular, once we pull off the lane number and name, with the rest we do rest.split()[-1]
to get the last segment without having to care about anything before it. This allows the middle to change formats entirely as long as the lane number, name, and time remain in their expected positions.
Notice the use of split_on
. This is a helper that I've yet to write, but identified as a common task done in a few places. It's better to pull this out into a small helper function (that can be tested on its own). While I was writing the parsers, I didn't concern myself with how this function would work. Now that I am, I recognize that this is nearly a function provided by python already:
import re
def split_on(regex, s):
    """Return the pieces of *s*, split wherever *regex* matches.

    The pattern is compiled in multiline mode, so ``^`` anchors match at
    the beginning of every line rather than only the start of the string.
    """
    pattern = re.compile(regex, re.MULTILINE)
    return pattern.split(s)
Now all that's left is RaceTime
and RaceTime.parse
. Ideally, we want a construction that's automatically sortable (dataclasses
and namedtuples
are). I'd recommend using a Decimal
(not float, they can't represent some numbers accurately) for number of seconds. This will be automatically sortable, easy to display, and easy to perform math with.
from decimal import Decimal
# order=True generates the comparison methods so RaceTimes really are
# sortable, as the surrounding text claims (frozen alone only gives __eq__).
@dataclass(frozen=True, order=True)
class RaceTime:
    """An elapsed race time, stored as a Decimal number of seconds."""
    elapsed_seconds: Decimal

    def __str__(self):
        seconds = self.elapsed_seconds % 60
        minutes = self.elapsed_seconds // 60
        # Restored the f-string braces that the scraped original had lost.
        return f'{minutes}:{seconds}' if minutes > 0 else f'{seconds}'
Parsing will give you a nice idea of why all this separation is good. Now that we have pulled the parsing of times out into a single location, we can test it alone. And that means it's much easier to make sure we handle things like NT
and the weird X
prefix in front of some times.
from decimal import Decimal
# order=True generates the comparison methods so RaceTimes really are
# sortable, as the surrounding text claims (frozen alone only gives __eq__).
@dataclass(frozen=True, order=True)
class RaceTime:
    """An elapsed race time, stored as a Decimal number of seconds."""
    elapsed_seconds: Decimal

    @classmethod
    def parse(cls, time_str):
        """Parse 'M:SS.ss' or 'SS.ss' into a RaceTime; 'NT' yields NO_TIME."""
        if time_str == 'NT':
            return NO_TIME
        try:
            mins, secs = time_str.split(':', 1)
        except ValueError:
            # No ':' present: the whole string is seconds.
            secs = time_str
            mins = 0
        # Use cls rather than hard-coding RaceTime so subclasses parse to
        # their own type.
        return cls(int(mins) * 60 + Decimal(secs))
# ...
class NoTime(RaceTime):
    """Sentinel RaceTime for 'NT' entries; never compares less than anything."""

    def __lt__(self, other):
        return False

# The inherited dataclass __init__ requires an elapsed_seconds argument, so
# the original bare NoTime() raised TypeError. Infinity keeps any numeric
# comparison or display of the sentinel sensible.
NO_TIME = NoTime(Decimal('Infinity'))
Admittedly, the NO_TIME
is about the only part of this that I don't feel great about, but it's better than just using None
(because that can't be sorted).
Now you can write some good unit tests for these small bits:
from unittest import TestCase
# from x import RaceTime, NO_TIME
class TestRaceTime(TestCase):
    """Unit tests for RaceTime.parse."""

    def test_parse(self):
        # assertEqual takes the expected and actual values as two separate
        # arguments (the original passed a single boolean expression, which
        # always "passes"), and the dataclass field is named
        # elapsed_seconds, not elapsed_time.
        self.assertEqual(Decimal('105.45'),
                         RaceTime.parse('1:45.45').elapsed_seconds)
        self.assertEqual(NO_TIME, RaceTime.parse('NT'))
That should about do it. Note how this code may not be shorter in terms of lines, but it is definitely shorter horizontally. Each "module" handles only one specific task and has defined API boundaries. Another advantage of these is that if later something were to fail in the script you'd get a fantastic traceback, something like "ValueError on line x in swimdata/heap.py" and the line would indicate the error occurred in Heat.parse()
so you'd know exactly where to look on the race page to see where the format changed.
In theory, everything above should "just work," but I haven't tested it. It's fairly simple though, which was another goal I had.
To motivate how you can use this to improve your code, here are the kinds of things I saw while reading your code that should be reflected in the example above:
- Use better names. What is
base_manager
? That doesn't tell me what it does.namei
?name_position
orname_index
may be better. These names are easier to understand for others reading your code (and maybe you later) - Avoid
global
. It is very hard to read code that uses (or worse mutates) global state, because then you have to scroll all over the place to see where data came from and what format it is in - As an aside to above, a lot of the places where you use globals you could've just passed as parameters (ex.
times_group()
called frombase_manager()
) - Separate IO from data processing (what if later you wanted to provide this script as a library for others to use? surely they wouldn't want it to spuriously print things)
- For input you can use a prompt string:
input('Your name: ')
- For errors, perhaps consider printing the error to stderr and returning an error code so your script is a bit easier to interface with from the CLI.
- Check out PEP8. Your formatting isn't too bad, but you have some long lines. PEP8 will force good formatting habits that hopefully encourage writing clearer code
- Usually we use single quotes for everything that isn't 3.6 format strings (ex.
" bar".format(foo)
) - You may want to use requests instead of urllib. In particular, it may not be safe to assume the content is UTF-8. Instead let the library decode it for you using the
Content-Type
. - Prefer
for
loops towhile
loops with index variables (ex.times_grouper()
) - You are doing a lot of work that you can leave to python (number parsing for example in
base_manager
)
Regexes may help make some of your parsing code a little cleaner- You make a lot of fragile assumptions about the data that you care about (number of digits, exact spacing). This is more likely to break than general assumptions about page format.
- There are lots of magic numbers in your code. Try to avoid them or use constants where they are needed to document their purpose. Especially for scraping, it is important to not rely on numbers like this because they are likely to change.
- Document what functions do (input, output, side effects) in
"""Doc comments."""
Write it from the perspective of someone wanting to use your function. What would they want to know. - Use
# inline comments
to explain certain lines that may not be immediately obvious (ex. see above how I explain why I slice off the first parts in the parsers)
That's about all I have time for, but hopefully that's helpful. If you'd like more pointers, I can make a gist with all of this nicely separated into modules and with examples of how to test it.
This is good code for a first stab. Let's dive in and see what can be improved.
I have one overarching recommendation. Scraping is a hard task because it incurs tech debt (in the form of if the source page format changes at all in a way that breaks one of your assumptions, you may have to completely rewrite parts of your code). It looks like this document is auto-generated, so its format is probably fairly stable. But I've made that assumption many times before and it's come back to bite me later.
I'd recommend separating concerns in your script. Right now you interweave all of your concerns (getting user input, downloading the page, parsing the page, extracting the desired information, and displaying output). Each of these concerns has its own error conditions and reasons to change, and either of those happening now likely requires a large rewrite on your part.
Instead, look to separate these concerns into "modules" (not necessarily Python modules, but perhaps functions or classes) with clear API boundaries. In this way, you can test them individually (integration/unit testing is especially important for the scraping component) and if any need to change, it will be an isolated change instead of a massive overhaul.
As a teaser, wouldn't it be nice if your main looked like this (forgive me if I miss some details from your script; another consequence of not separating concerns is that the code is harder to grok):
import sys
def main():
    """CLI entry point: print the standings for the requested swimmer."""
    try:
        standings = get_swimmer_standing(args.swimmer_name, args.race_url)
        for i, swimmer in enumerate(standings):
            # Restored the f-string braces that the scraped original lost.
            print(f'{i + 1}. {swimmer.name} {swimmer.time}')
    except SwimmingDataError as e:
        # Report errors on stderr with a non-zero exit code so the script
        # composes well with other CLI tools.
        print(e, file=sys.stderr)
        sys.exit(1)
Digging further let's see what a nice get_swimmer_standing
would look like:
from operator import attrgetter
def get_swimmer_standing(swimmer_name, race_url):
    """Returns the standings of the event in which swimmer_name participated.

    Retrieves race information from race_url, locates the event in which
    swimmer_name participated and returns a list of faster swimmers sorted
    by time (includes the requested swimmer).
    """
    race_page = RacePage.parse(get_race_page(race_url))
    event, swimmer = find_swimmer_event(race_page, swimmer_name)
    # sorted() already returns a list; the list() wrapper was redundant.
    standings = sorted(event.swimmers, key=attrgetter('time'))
    # Lists have no .find() method; .index() is the correct call (it raises
    # ValueError if the swimmer is somehow absent, which would be a bug).
    return standings[:standings.index(swimmer) + 1]
We note that get_race_page
can do the urllib
stuff you did previously and should return the contents of the page or raise a subclass of SwimmerDataError
.
Let's then dig into find_swimmer_event
:
def find_swimmer_event(race_page, swimmer_name):
    """Return the (Event, Swimmer) pair for swimmer_name on race_page.

    Scans every event's swimmers in page order and returns the first match;
    raises SwimmerNotFoundError if no event contains the requested name.
    """
    matches = (
        (event, entrant)
        for event in race_page.events
        for entrant in event.swimmers
        if entrant.name == swimmer_name
    )
    for found in matches:
        return found
    raise SwimmerNotFoundError(swimmer_name)
What's important to note here is that, so far, we've been dealing at the right levels of abstraction and we've clearly handled our error cases. "Modules" (functions up to this point) do one task and do it well (see Single Responsibility Principle).
Also note that I've been dreaming up this API as I go. At the point of writing this sentence, I haven't yet considered any of the details of the lower levels (ex. parsing of the page). Instead, I am deciding the best API for accessing the data the higher levels need and using that to inform my decisions of how to design lower levels.
Now that we'll start talking about scraping, let's first formulate the data classes we'll need. These are preferable to global variables like event_start
, event_end
, and own_time
because they keep related data together and give a uniform API for querying it.
Note that I'm using dataclasses here (which are Python 3.7), but you could use namedtuples on 3.6 (it's just the syntax for giving them methods is a little messy).
from typing import List
@dataclass(frozen=True)
class RacePage:
    """Top-level parse result: every event listed on one race page."""
    # Quoted forward reference: Event is defined further down the module, so
    # a bare List[Event] annotation would raise NameError at class creation.
    events: List['Event']
@dataclass(frozen=True)
class Event:
    """One event (e.g. '100m Freestyle'), made up of one or more heats."""
    num: int
    name: str
    heats: List['Heat']  # quoted forward reference: Heat is defined below

    @property
    def swimmers(self):
        """Yield every swimmer across all heats of this event."""
        # The original signature was missing self, which would raise
        # TypeError on any access to the property.
        for heat in self.heats:
            yield from heat.swimmers
@dataclass(frozen=True)
class Heat:
    """One heat: its number and the swimmers assigned to lanes in it."""
    num: int
    swimmers: List['Swimmer']  # quoted forward reference: Swimmer is defined below
@dataclass(frozen=True)
class Swimmer:
    """One swimmer's entry: lane number, name, and entered/result time."""
    lane: int
    name: str
    time: 'RaceTime'  # quoted forward reference: RaceTime is defined below
The above code alone would just work with the functions we defined above. The data classes can (and usually should be) for the most part dumb containers. Their power is in defining a nice API that will allow us to make other queries later should we need to (ex. "who is the youngest swimmer?").
Now for the actual parsing, we're again going to try to limit responsibility. RacePage.parse
should only be concerned with trimming off the header and separating the page into several strings beginning with "Event "
. It then delegates the parsing of each event to Event.parse
. That, in turn, is responsible for extracting the event's name, trimming the header, and splitting the remaining texts into strings beginning with "Heat "
. Then it defers to Heat.parse
. That should extract the heat number, trim the header, and split the lines, delegating to Swimmer.parse
. Do you see the pattern here?
Let's see just how simple these parsers can be if we continue to separate concerns and work at the right level of abstraction:
import re
@dataclass(frozen=True)
class RacePage:
    """Top-level parse result: every event listed on one race page."""
    events: List['Event']  # quoted forward reference: Event is defined below

    @classmethod
    def parse(cls, page_content):
        """Split the raw page text into per-event chunks and parse each."""
        # First part is the page header before the first event, so drop it.
        # Restored the '\d' escape the scraped original had lost ('d+' would
        # match literal 'd' characters).
        events = split_on(r'^Event \d+ ', page_content)[1:]
        return cls(list(map(Event.parse, events)))
@dataclass(frozen=True)
class Event:
    """One event: its number, name, and heats."""
    num: int
    name: str
    heats: List['Heat']  # quoted forward reference: Heat is defined below

    @classmethod
    def parse(cls, event_content):
        """Parse one event's text into an Event and its heats."""
        # Parse first line for event number and name, e.g.:
        # Event 123 Olympic Freestyle Trials
        # Restored the (?P<...>) named-group syntax and the '\d'/'\n'
        # escapes that the scraped original had lost.
        m = re.match(r'Event +(?P<num>\d+) +(?P<name>[^\n]+)', event_content)
        num = int(m.group('num'))
        name = m.group('name').strip()
        # The first part is the event header before the first heat; drop it.
        heats = split_on(r'Heat \d+ ', event_content)[1:]
        return cls(num, name, list(map(Heat.parse, heats)))
# ...
@dataclass(frozen=True)
class Heat:
    """A single heat within an event: the heat number and its swimmers."""
    num: int
    swimmers: List['Swimmer']  # quoted forward reference: Swimmer is defined later

    @classmethod
    def parse(cls, heat_content):
        """Parse one heat's text (a header line followed by one line per swimmer)."""
        # Parse heat number from the header, e.g.:
        # Heat 1 of 10 Timed Finals Starts at 05:29 PM
        # Restored the (?P<...>) named-group syntax and the lost '\d' escape
        # (the scraped original had '(?<num>d+)', which is invalid in Python).
        m = re.match(r'Heat (?P<num>\d+) ', heat_content)
        num = int(m.group('num'))
        # The first line is the heat header before the first swimmer.
        swimmers = heat_content.split('\n')[1:]
        return cls(num, list(map(Swimmer.parse, swimmers)))
@dataclass(frozen=True)
class Swimmer:
    """One swimmer's entry in a heat: lane, name, and entered/result time."""
    lane: int
    name: str
    time: 'RaceTime'  # quoted forward reference: RaceTime is defined later

    @classmethod
    def parse(cls, swimmer_line):
        """Parse lane number, name, and time from a heat line, e.g.:

            5 Michael Phelps 28 USA 50.58

        Note: assumes names don't contain digits.
        """
        # Restored the (?P<...>) named-group syntax and the '\d'/'\D' escapes
        # that the scraped original had lost.
        m = re.match(r' *(?P<lane>\d+) +(?P<name>\D+) +(?P<rest>.+)', swimmer_line)
        lane = int(m.group('lane'))
        name = m.group('name')
        rest = m.group('rest')
        # Only the last token of the remainder is the time; everything in the
        # middle (age, club, ...) is deliberately ignored so those columns can
        # change format without breaking the parser.
        time = RaceTime.parse(rest.split()[-1])
        return cls(lane, name, time)
Note that these parse methods are very simple. I've used regexes to keep things clean. Also note that I collect more information than you need (which you can choose to omit). A particular strategy that's important for longevity is to be extremely forgiving about formatting. So for example, since all we need for Swimmer
is the first number and name and the last time, we are careful to not make any assumptions about what is in between. In particular, once we pull off the lane number and name, with the rest we do rest.split()[-1]
to get the last segment without having to care about anything before it. This allows the middle to change formats entirely as long as the lane number, name, and time remain in their expected positions.
Notice the use of split_on
. This is a helper that I've yet to write, but identified as a common task done in a few places. It's better to pull this out into a small helper function (that can be tested on its own). While I was writing the parsers, I didn't concern myself with how this function would work. Now that I am, I recognize that this is nearly a function provided by python already:
import re
def split_on(regex, s):
    """Return the pieces of *s*, split wherever *regex* matches.

    The pattern is compiled in multiline mode, so ``^`` anchors match at
    the beginning of every line rather than only the start of the string.
    """
    pattern = re.compile(regex, re.MULTILINE)
    return pattern.split(s)
Now all that's left is RaceTime
and RaceTime.parse
. Ideally, we want a construction that's automatically sortable (dataclasses
and namedtuples
are). I'd recommend using a Decimal
(not float, they can't represent some numbers accurately) for number of seconds. This will be automatically sortable, easy to display, and easy to perform math with.
from decimal import Decimal
# order=True generates the comparison methods so RaceTimes really are
# sortable, as the surrounding text claims (frozen alone only gives __eq__).
@dataclass(frozen=True, order=True)
class RaceTime:
    """An elapsed race time, stored as a Decimal number of seconds."""
    elapsed_seconds: Decimal

    def __str__(self):
        seconds = self.elapsed_seconds % 60
        minutes = self.elapsed_seconds // 60
        # Restored the f-string braces that the scraped original had lost.
        return f'{minutes}:{seconds}' if minutes > 0 else f'{seconds}'
Parsing will give you a nice idea of why all this separation is good. Now that we have pulled the parsing of times out into a single location, we can test it alone. And that means it's much easier to make sure we handle things like NT
and the weird X
prefix in front of some times.
from decimal import Decimal
# order=True generates the comparison methods so RaceTimes really are
# sortable, as the surrounding text claims (frozen alone only gives __eq__).
@dataclass(frozen=True, order=True)
class RaceTime:
    """An elapsed race time, stored as a Decimal number of seconds."""
    elapsed_seconds: Decimal

    @classmethod
    def parse(cls, time_str):
        """Parse 'M:SS.ss' or 'SS.ss' into a RaceTime; 'NT' yields NO_TIME."""
        if time_str == 'NT':
            return NO_TIME
        try:
            mins, secs = time_str.split(':', 1)
        except ValueError:
            # No ':' present: the whole string is seconds.
            secs = time_str
            mins = 0
        # Use cls rather than hard-coding RaceTime so subclasses parse to
        # their own type.
        return cls(int(mins) * 60 + Decimal(secs))
# ...
class NoTime(RaceTime):
    """Sentinel RaceTime for 'NT' entries; never compares less than anything."""

    def __lt__(self, other):
        return False

# The inherited dataclass __init__ requires an elapsed_seconds argument, so
# the original bare NoTime() raised TypeError. Infinity keeps any numeric
# comparison or display of the sentinel sensible.
NO_TIME = NoTime(Decimal('Infinity'))
Admittedly, the NO_TIME
is about the only part of this that I don't feel great about, but it's better than just using None
(because that can't be sorted).
Now you can write some good unit tests for these small bits:
from unittest import TestCase
# from x import RaceTime, NO_TIME
class TestRaceTime(TestCase):
    """Unit tests for RaceTime.parse."""

    def test_parse(self):
        # assertEqual takes the expected and actual values as two separate
        # arguments (the original passed a single boolean expression, which
        # always "passes"), and the dataclass field is named
        # elapsed_seconds, not elapsed_time.
        self.assertEqual(Decimal('105.45'),
                         RaceTime.parse('1:45.45').elapsed_seconds)
        self.assertEqual(NO_TIME, RaceTime.parse('NT'))
That should about do it. Note how this code may not be shorter in terms of lines, but it is definitely shorter horizontally. Each "module" handles only one specific task and has defined API boundaries. Another advantage of these is that if later something were to fail in the script you'd get a fantastic traceback, something like "ValueError on line x in swimdata/heap.py" and the line would indicate the error occurred in Heat.parse()
so you'd know exactly where to look on the race page to see where the format changed.
In theory, everything above should "just work," but I haven't tested it. It's fairly simple though, which was another goal I had.
To motivate how you can use this to improve your code, here are the kinds of things I saw while reading your code that should be reflected in the example above:
- Use better names. What is
base_manager
? That doesn't tell me what it does.namei
?name_position
orname_index
may be better. These names are easier to understand for others reading your code (and maybe you later) - Avoid
global
. It is very hard to read code that uses (or worse mutates) global state, because then you have to scroll all over the place to see where data came from and what format it is in - As an aside to above, a lot of the places where you use globals you could've just passed as parameters (ex.
times_group()
called frombase_manager()
) - Separate IO from data processing (what if later you wanted to provide this script as a library for others to use? surely they wouldn't want it to spuriously print things)
- For input you can use a prompt string:
input('Your name: ')
- For errors, perhaps consider printing the error to stderr and returning an error code so your script is a bit easier to interface with from the CLI.
- Check out PEP8. Your formatting isn't too bad, but you have some long lines. PEP8 will force good formatting habits that hopefully encourage writing clearer code
- Usually we use single quotes for everything that isn't 3.6 format strings (ex.
" bar".format(foo)
) - You may want to use requests instead of urllib. In particular, it may not be safe to assume the content is UTF-8. Instead let the library decode it for you using the
Content-Type
. - Prefer
for
loops towhile
loops with index variables (ex.times_grouper()
) - You are doing a lot of work that you can leave to python (number parsing for example in
base_manager
)
Regexes may help make some of your parsing code a little cleaner- You make a lot of fragile assumptions about the data that you care about (number of digits, exact spacing). This is more likely to break than general assumptions about page format.
- There are lots of magic numbers in your code. Try to avoid them or use constants where they are needed to document their purpose. Especially for scraping, it is important to not rely on numbers like this because they are likely to change.
- Document what functions do (input, output, side effects) in
"""Doc comments."""
Write it from the perspective of someone wanting to use your function. What would they want to know. - Use
# inline comments
to explain certain lines that may not be immediately obvious (ex. see above how I explain why I slice off the first parts in the parsers)
That's about all I have time for, but hopefully that's helpful. If you'd like more pointers, I can make a gist with all of this nicely separated into modules and with examples of how to test it.
answered Jul 13 at 22:03
Bailey Parker
1,161710
1,161710
add a comment |
add a comment |
Sign up or log in
StackExchange.ready(function ()
StackExchange.helpers.onClickDraftSave('#login-link');
);
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
StackExchange.ready(
function ()
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f198447%2ffinding-a-swimmers-competing-position-from-a-website%23new-answer', 'question_page');
);
Post as a guest
Sign up or log in
StackExchange.ready(function ()
StackExchange.helpers.onClickDraftSave('#login-link');
);
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Sign up or log in
StackExchange.ready(function ()
StackExchange.helpers.onClickDraftSave('#login-link');
);
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Sign up or log in
StackExchange.ready(function ()
StackExchange.helpers.onClickDraftSave('#login-link');
);
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Sign up using Google
Sign up using Facebook
Sign up using Email and Password