Construct and send an HTTP get request from scratch and print all the received data to the screen
Clash Royale CLAN TAG#URR8PPP
.everyoneloves__top-leaderboard:empty,.everyoneloves__mid-leaderboard:empty margin-bottom:0;
up vote
3
down vote
favorite
My concerns:
Is the code pythonic?
Does the receive_all
function make the code any cleaner or should that functionally be apart of the get
function?
import socket
import urlparse
CONNECTION_TIMEOUT = 0.30
CHUNK_SIZE = 1024
socket.setdefaulttimeout = CONNECTION_TIMEOUT
CRLF = "rnrn"
def receive_all(sock, chunk_size=CHUNK_SIZE):
'''
Gather all the data from a request.
'''
chunks =
while True:
chunk = sock.recv(chunk_size)
if chunk:
chunks.append(chunk)
else:
break
return repr(''.join(chunks))
def get(url, timeout=CONNECTION_TIMEOUT, chunk_size=CHUNK_SIZE):
url = urlparse.urlparse(url)
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
sock.settimeout(timeout)
sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
sock.connect((url.netloc, url.port or 80))
sock.send('GET 0 HTTP/1.0 1'.format(url.path or '/', CRLF))
data = receive_all(sock)
sock.shutdown(1)
sock.close()
return data
print(get('http://www.google.com/robots.txt', chunk_size=65535))
python reinventing-the-wheel http socket
add a comment |Â
up vote
3
down vote
favorite
My concerns:
Is the code pythonic?
Does the receive_all
function make the code any cleaner or should that functionally be apart of the get
function?
import socket
import urlparse
CONNECTION_TIMEOUT = 0.30
CHUNK_SIZE = 1024
socket.setdefaulttimeout = CONNECTION_TIMEOUT
CRLF = "rnrn"
def receive_all(sock, chunk_size=CHUNK_SIZE):
'''
Gather all the data from a request.
'''
chunks =
while True:
chunk = sock.recv(chunk_size)
if chunk:
chunks.append(chunk)
else:
break
return repr(''.join(chunks))
def get(url, timeout=CONNECTION_TIMEOUT, chunk_size=CHUNK_SIZE):
url = urlparse.urlparse(url)
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
sock.settimeout(timeout)
sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
sock.connect((url.netloc, url.port or 80))
sock.send('GET 0 HTTP/1.0 1'.format(url.path or '/', CRLF))
data = receive_all(sock)
sock.shutdown(1)
sock.close()
return data
print(get('http://www.google.com/robots.txt', chunk_size=65535))
python reinventing-the-wheel http socket
socket.setdefaulttimeout()
is a function. Are you sure you didn't mean to call it withCONNECTION_TIMEOUT
as argument?
â Daniel
Jun 8 at 17:56
@Coal_ Thanks for pointing out that bug
â Ricky Wilson
Jun 8 at 18:01
Does this code work correctly?
â Phrancis
Jun 8 at 18:11
@Phrancis Just tested it, works fine for Python 2.
â Daniel
Jun 8 at 18:14
add a comment |Â
up vote
3
down vote
favorite
up vote
3
down vote
favorite
My concerns:
Is the code pythonic?
Does the receive_all
function make the code any cleaner or should that functionally be apart of the get
function?
import socket
import urlparse
CONNECTION_TIMEOUT = 0.30
CHUNK_SIZE = 1024
socket.setdefaulttimeout = CONNECTION_TIMEOUT
CRLF = "rnrn"
def receive_all(sock, chunk_size=CHUNK_SIZE):
'''
Gather all the data from a request.
'''
chunks =
while True:
chunk = sock.recv(chunk_size)
if chunk:
chunks.append(chunk)
else:
break
return repr(''.join(chunks))
def get(url, timeout=CONNECTION_TIMEOUT, chunk_size=CHUNK_SIZE):
url = urlparse.urlparse(url)
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
sock.settimeout(timeout)
sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
sock.connect((url.netloc, url.port or 80))
sock.send('GET 0 HTTP/1.0 1'.format(url.path or '/', CRLF))
data = receive_all(sock)
sock.shutdown(1)
sock.close()
return data
print(get('http://www.google.com/robots.txt', chunk_size=65535))
python reinventing-the-wheel http socket
My concerns:
Is the code pythonic?
Does the receive_all
function make the code any cleaner or should that functionally be apart of the get
function?
import socket
import urlparse
CONNECTION_TIMEOUT = 0.30
CHUNK_SIZE = 1024
socket.setdefaulttimeout = CONNECTION_TIMEOUT
CRLF = "rnrn"
def receive_all(sock, chunk_size=CHUNK_SIZE):
'''
Gather all the data from a request.
'''
chunks =
while True:
chunk = sock.recv(chunk_size)
if chunk:
chunks.append(chunk)
else:
break
return repr(''.join(chunks))
def get(url, timeout=CONNECTION_TIMEOUT, chunk_size=CHUNK_SIZE):
url = urlparse.urlparse(url)
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
sock.settimeout(timeout)
sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
sock.connect((url.netloc, url.port or 80))
sock.send('GET 0 HTTP/1.0 1'.format(url.path or '/', CRLF))
data = receive_all(sock)
sock.shutdown(1)
sock.close()
return data
print(get('http://www.google.com/robots.txt', chunk_size=65535))
python reinventing-the-wheel http socket
edited Jun 9 at 7:44
Mathias Ettinger
21.8k32875
21.8k32875
asked Jun 8 at 17:51
Ricky Wilson
820915
820915
socket.setdefaulttimeout()
is a function. Are you sure you didn't mean to call it withCONNECTION_TIMEOUT
as argument?
â Daniel
Jun 8 at 17:56
@Coal_ Thanks for pointing out that bug
â Ricky Wilson
Jun 8 at 18:01
Does this code work correctly?
â Phrancis
Jun 8 at 18:11
@Phrancis Just tested it, works fine for Python 2.
â Daniel
Jun 8 at 18:14
add a comment |Â
socket.setdefaulttimeout()
is a function. Are you sure you didn't mean to call it withCONNECTION_TIMEOUT
as argument?
â Daniel
Jun 8 at 17:56
@Coal_ Thanks for pointing out that bug
â Ricky Wilson
Jun 8 at 18:01
Does this code work correctly?
â Phrancis
Jun 8 at 18:11
@Phrancis Just tested it, works fine for Python 2.
â Daniel
Jun 8 at 18:14
socket.setdefaulttimeout()
is a function. Are you sure you didn't mean to call it with CONNECTION_TIMEOUT
as argument?â Daniel
Jun 8 at 17:56
socket.setdefaulttimeout()
is a function. Are you sure you didn't mean to call it with CONNECTION_TIMEOUT
as argument?â Daniel
Jun 8 at 17:56
@Coal_ Thanks for pointing out that bug
â Ricky Wilson
Jun 8 at 18:01
@Coal_ Thanks for pointing out that bug
â Ricky Wilson
Jun 8 at 18:01
Does this code work correctly?
â Phrancis
Jun 8 at 18:11
Does this code work correctly?
â Phrancis
Jun 8 at 18:11
@Phrancis Just tested it, works fine for Python 2.
â Daniel
Jun 8 at 18:14
@Phrancis Just tested it, works fine for Python 2.
â Daniel
Jun 8 at 18:14
add a comment |Â
1 Answer
1
active
oldest
votes
up vote
3
down vote
Review
Is the code pythonic?
The code looks quite clean to me, but it never gets pythonic enough ;)
Some PEP-8 issues:
Don't mix single and double quotes;1
Leave two blank lines between the constants and the first function / class definition.2
Don't use
repr()
:repr()
is meant to get a printable (human readable)
version of an object. In this case, you don't even need to modify the chunks in any way, because they are already of the correct type (str
).You should consider a longer connection timeout. I find a second to be acceptable, but this of course depends on your use case.
There's usually no reason to set
socket.SO_REUSEADDR
. Regardless, doing so only has an effect if you bind the socket to an address after having done so.socket.socket.send()
does not guarantee that all of what you passed it was sent, instead, it returns the number of bytes sent. You may want to put that part in a loop to assure all data was sent. Or considersocket.socket.sendall()
.If you choose to explicitly call
socket.socket.shutdown()
, make sure you get the argument right.1
corresponds tosocket.SHUT_WR
, but you need to prevent reading and writing. Use module constants!socket.SHUT_RDWR
is the answer.I see you added a docstring for
receive_all()
. That's a nice start, but the docstring is a little generic. A more elaborate description of the arguments and return type and value would be helpful. The same goes forget()
. If this were a library,get()
would be part of the API, so documenting it should be a priority.Make sure the documentation is accurate. 'Gather all the data from a request' makes it sound like the socket already made a request and the function merely modifies the response in some way and then returns it. That may give people the false impression that the function has no side effects, but it does!
Let's say you extend this piece of code, and add more helper functions. If one of them raises an exception, the socket is never closed, and you leave the server hanging. Luckily, Python sockets provide context manager capabilities out of the box. That means we can make sure resources are cleaned up:
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
sock.settimeout(timeout)
...
return dataTurns out this has only been a feature since Python 3.2. One of the many reasons to switch to Python 3. Are you being forced to work with Python 2? A
try
/finally
is probably the closest we can get:try:
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
sock.settimeout(timeout)
...
finally:
sock.shutdown(socket.SHUT_RDWR)
sock.close()
Does the receive_all function make the code any cleaner or should that functionally be apart of the get function?
It's a good idea to extract functionality and put it in its own function, so yes, it does make the code cleaner. Should you be a big fan of OOP, a custom socket class extending from socket.socket
, with a receive_all()
method, would be the solution.
My take on it
import socket
import urlparse
CONNECTION_TIMEOUT = 0.30
CHUNK_SIZE = 1024
CRLF = "rnrn"
socket.setdefaulttimeout(CONNECTION_TIMEOUT)
def receive_all(sock, chunk_size=CHUNK_SIZE):
"""Receive all data from a socket, until EOF is reached.
Arguments:
- sock (socket.socket): The socket to receive from. This socket must
be in a valid state, where it can receive data.
- chunk_size (int) = 1024: The size of the chunks to receive, in
bytes. Commonly a multiple of 2 (1024, 2048, 4096). When in doubt,
do not change.
Return:
- (str) All the data received.
"""
chunks =
while True:
chunk = sock.recv(chunk_size)
if chunk:
chunks.append(chunk)
else:
break
return "".join(chunks)
def get(url, timeout=CONNECTION_TIMEOUT, chunk_size=CHUNK_SIZE):
"""Get the HTTP response from a webserver.
Arguments:
- url (str): A URL (optionally with a scheme attached).
- timeout (int / float) = 0.3: The number of seconds to wait before
timing out. May also be less than 1; a timeout of 0.5 corresponds to
half a second.
- chunk_size (int) = 1024: The size of the chunks to receive, in bytes.
Commonly a multiple of 2 (1024, 2048, 4096). When in doubt, do not
change.
Return:
- (str): The HTTP response.
"""
url = urlparse.urlparse(url)
try:
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
sock.settimeout(timeout)
sock.connect((url.netloc, url.port or 80))
sock.sendall("GET 0 HTTP/1.0 1".format(url.path or "/", CRLF))
data = receive_all(sock)
finally:
sock.shutdown(socket.SHUT_RDWR)
sock.close()
return data
if __name__ == "__main__":
print(get("http://www.google.com/robots.txt", chunk_size=65535))
References
1 PEP-8: String Quotes
2 PEP-8: Blank Lines
add a comment |Â
1 Answer
1
active
oldest
votes
1 Answer
1
active
oldest
votes
active
oldest
votes
active
oldest
votes
up vote
3
down vote
Review
Is the code pythonic?
The code looks quite clean to me, but it never gets pythonic enough ;)
Some PEP-8 issues:
Don't mix single and double quotes;1
Leave two blank lines between the constants and the first function / class definition.2
Don't use
repr()
:repr()
is meant to get a printable (human readable)
version of an object. In this case, you don't even need to modify the chunks in any way, because they are already of the correct type (str
).You should consider a longer connection timeout. I find a second to be acceptable, but this of course depends on your use case.
There's usually no reason to set
socket.SO_REUSEADDR
. Regardless, doing so only has an effect if you bind the socket to an address after having done so.socket.socket.send()
does not guarantee that all of what you passed it was sent, instead, it returns the number of bytes sent. You may want to put that part in a loop to assure all data was sent. Or considersocket.socket.sendall()
.If you choose to explicitly call
socket.socket.shutdown()
, make sure you get the argument right.1
corresponds tosocket.SHUT_WR
, but you need to prevent reading and writing. Use module constants!socket.SHUT_RDWR
is the answer.I see you added a docstring for
receive_all()
. That's a nice start, but the docstring is a little generic. A more elaborate description of the arguments and return type and value would be helpful. The same goes forget()
. If this were a library,get()
would be part of the API, so documenting it should be a priority.Make sure the documentation is accurate. 'Gather all the data from a request' makes it sound like the socket already made a request and the function merely modifies the response in some way and then returns it. That may give people the false impression that the function has no side effects, but it does!
Let's say you extend this piece of code, and add more helper functions. If one of them raises an exception, the socket is never closed, and you leave the server hanging. Luckily, Python sockets provide context manager capabilities out of the box. That means we can make sure resources are cleaned up:
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
sock.settimeout(timeout)
...
return dataTurns out this has only been a feature since Python 3.2. One of the many reasons to switch to Python 3. Are you being forced to work with Python 2? A
try
/finally
is probably the closest we can get:try:
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
sock.settimeout(timeout)
...
finally:
sock.shutdown(socket.SHUT_RDWR)
sock.close()
Does the receive_all function make the code any cleaner or should that functionally be apart of the get function?
It's a good idea to extract functionality and put it in its own function, so yes, it does make the code cleaner. Should you be a big fan of OOP, a custom socket class extending from socket.socket
, with a receive_all()
method, would be the solution.
My take on it
import socket
import urlparse
CONNECTION_TIMEOUT = 0.30
CHUNK_SIZE = 1024
CRLF = "rnrn"
socket.setdefaulttimeout(CONNECTION_TIMEOUT)
def receive_all(sock, chunk_size=CHUNK_SIZE):
"""Receive all data from a socket, until EOF is reached.
Arguments:
- sock (socket.socket): The socket to receive from. This socket must
be in a valid state, where it can receive data.
- chunk_size (int) = 1024: The size of the chunks to receive, in
bytes. Commonly a multiple of 2 (1024, 2048, 4096). When in doubt,
do not change.
Return:
- (str) All the data received.
"""
chunks =
while True:
chunk = sock.recv(chunk_size)
if chunk:
chunks.append(chunk)
else:
break
return "".join(chunks)
def get(url, timeout=CONNECTION_TIMEOUT, chunk_size=CHUNK_SIZE):
"""Get the HTTP response from a webserver.
Arguments:
- url (str): A URL (optionally with a scheme attached).
- timeout (int / float) = 0.3: The number of seconds to wait before
timing out. May also be less than 1; a timeout of 0.5 corresponds to
half a second.
- chunk_size (int) = 1024: The size of the chunks to receive, in bytes.
Commonly a multiple of 2 (1024, 2048, 4096). When in doubt, do not
change.
Return:
- (str): The HTTP response.
"""
url = urlparse.urlparse(url)
try:
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
sock.settimeout(timeout)
sock.connect((url.netloc, url.port or 80))
sock.sendall("GET 0 HTTP/1.0 1".format(url.path or "/", CRLF))
data = receive_all(sock)
finally:
sock.shutdown(socket.SHUT_RDWR)
sock.close()
return data
if __name__ == "__main__":
print(get("http://www.google.com/robots.txt", chunk_size=65535))
References
1 PEP-8: String Quotes
2 PEP-8: Blank Lines
add a comment |Â
up vote
3
down vote
Review
Is the code pythonic?
The code looks quite clean to me, but it never gets pythonic enough ;)
Some PEP-8 issues:
Don't mix single and double quotes;1
Leave two blank lines between the constants and the first function / class definition.2
Don't use
repr()
:repr()
is meant to get a printable (human readable)
version of an object. In this case, you don't even need to modify the chunks in any way, because they are already of the correct type (str
).You should consider a longer connection timeout. I find a second to be acceptable, but this of course depends on your use case.
There's usually no reason to set
socket.SO_REUSEADDR
. Regardless, doing so only has an effect if you bind the socket to an address after having done so.socket.socket.send()
does not guarantee that all of what you passed it was sent, instead, it returns the number of bytes sent. You may want to put that part in a loop to assure all data was sent. Or considersocket.socket.sendall()
.If you choose to explicitly call
socket.socket.shutdown()
, make sure you get the argument right.1
corresponds tosocket.SHUT_WR
, but you need to prevent reading and writing. Use module constants!socket.SHUT_RDWR
is the answer.I see you added a docstring for
receive_all()
. That's a nice start, but the docstring is a little generic. A more elaborate description of the arguments and return type and value would be helpful. The same goes forget()
. If this were a library,get()
would be part of the API, so documenting it should be a priority.Make sure the documentation is accurate. 'Gather all the data from a request' makes it sound like the socket already made a request and the function merely modifies the response in some way and then returns it. That may give people the false impression that the function has no side effects, but it does!
Let's say you extend this piece of code, and add more helper functions. If one of them raises an exception, the socket is never closed, and you leave the server hanging. Luckily, Python sockets provide context manager capabilities out of the box. That means we can make sure resources are cleaned up:
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
sock.settimeout(timeout)
...
return dataTurns out this has only been a feature since Python 3.2. One of the many reasons to switch to Python 3. Are you being forced to work with Python 2? A
try
/finally
is probably the closest we can get:try:
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
sock.settimeout(timeout)
...
finally:
sock.shutdown(socket.SHUT_RDWR)
sock.close()
Does the receive_all function make the code any cleaner or should that functionally be apart of the get function?
It's a good idea to extract functionality and put it in its own function, so yes, it does make the code cleaner. Should you be a big fan of OOP, a custom socket class extending from socket.socket
, with a receive_all()
method, would be the solution.
My take on it
import socket
import urlparse
CONNECTION_TIMEOUT = 0.30
CHUNK_SIZE = 1024
CRLF = "rnrn"
socket.setdefaulttimeout(CONNECTION_TIMEOUT)
def receive_all(sock, chunk_size=CHUNK_SIZE):
"""Receive all data from a socket, until EOF is reached.
Arguments:
- sock (socket.socket): The socket to receive from. This socket must
be in a valid state, where it can receive data.
- chunk_size (int) = 1024: The size of the chunks to receive, in
bytes. Commonly a multiple of 2 (1024, 2048, 4096). When in doubt,
do not change.
Return:
- (str) All the data received.
"""
chunks =
while True:
chunk = sock.recv(chunk_size)
if chunk:
chunks.append(chunk)
else:
break
return "".join(chunks)
def get(url, timeout=CONNECTION_TIMEOUT, chunk_size=CHUNK_SIZE):
"""Get the HTTP response from a webserver.
Arguments:
- url (str): A URL (optionally with a scheme attached).
- timeout (int / float) = 0.3: The number of seconds to wait before
timing out. May also be less than 1; a timeout of 0.5 corresponds to
half a second.
- chunk_size (int) = 1024: The size of the chunks to receive, in bytes.
Commonly a multiple of 2 (1024, 2048, 4096). When in doubt, do not
change.
Return:
- (str): The HTTP response.
"""
url = urlparse.urlparse(url)
try:
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
sock.settimeout(timeout)
sock.connect((url.netloc, url.port or 80))
sock.sendall("GET 0 HTTP/1.0 1".format(url.path or "/", CRLF))
data = receive_all(sock)
finally:
sock.shutdown(socket.SHUT_RDWR)
sock.close()
return data
if __name__ == "__main__":
print(get("http://www.google.com/robots.txt", chunk_size=65535))
References
1 PEP-8: String Quotes
2 PEP-8: Blank Lines
add a comment |Â
up vote
3
down vote
up vote
3
down vote
Review
Is the code pythonic?
The code looks quite clean to me, but it never gets pythonic enough ;)
Some PEP-8 issues:
Don't mix single and double quotes;1
Leave two blank lines between the constants and the first function / class definition.2
Don't use
repr()
:repr()
is meant to get a printable (human readable)
version of an object. In this case, you don't even need to modify the chunks in any way, because they are already of the correct type (str
).You should consider a longer connection timeout. I find a second to be acceptable, but this of course depends on your use case.
There's usually no reason to set
socket.SO_REUSEADDR
. Regardless, doing so only has an effect if you bind the socket to an address after having done so.socket.socket.send()
does not guarantee that all of what you passed it was sent, instead, it returns the number of bytes sent. You may want to put that part in a loop to assure all data was sent. Or considersocket.socket.sendall()
.If you choose to explicitly call
socket.socket.shutdown()
, make sure you get the argument right.1
corresponds tosocket.SHUT_WR
, but you need to prevent reading and writing. Use module constants!socket.SHUT_RDWR
is the answer.I see you added a docstring for
receive_all()
. That's a nice start, but the docstring is a little generic. A more elaborate description of the arguments and return type and value would be helpful. The same goes forget()
. If this were a library,get()
would be part of the API, so documenting it should be a priority.Make sure the documentation is accurate. 'Gather all the data from a request' makes it sound like the socket already made a request and the function merely modifies the response in some way and then returns it. That may give people the false impression that the function has no side effects, but it does!
Let's say you extend this piece of code, and add more helper functions. If one of them raises an exception, the socket is never closed, and you leave the server hanging. Luckily, Python sockets provide context manager capabilities out of the box. That means we can make sure resources are cleaned up:
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
sock.settimeout(timeout)
...
return dataTurns out this has only been a feature since Python 3.2. One of the many reasons to switch to Python 3. Are you being forced to work with Python 2? A
try
/finally
is probably the closest we can get:try:
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
sock.settimeout(timeout)
...
finally:
sock.shutdown(socket.SHUT_RDWR)
sock.close()
Does the receive_all function make the code any cleaner or should that functionally be apart of the get function?
It's a good idea to extract functionality and put it in its own function, so yes, it does make the code cleaner. Should you be a big fan of OOP, a custom socket class extending from socket.socket
, with a receive_all()
method, would be the solution.
My take on it
import socket
import urlparse
CONNECTION_TIMEOUT = 0.30
CHUNK_SIZE = 1024
CRLF = "rnrn"
socket.setdefaulttimeout(CONNECTION_TIMEOUT)
def receive_all(sock, chunk_size=CHUNK_SIZE):
"""Receive all data from a socket, until EOF is reached.
Arguments:
- sock (socket.socket): The socket to receive from. This socket must
be in a valid state, where it can receive data.
- chunk_size (int) = 1024: The size of the chunks to receive, in
bytes. Commonly a multiple of 2 (1024, 2048, 4096). When in doubt,
do not change.
Return:
- (str) All the data received.
"""
chunks =
while True:
chunk = sock.recv(chunk_size)
if chunk:
chunks.append(chunk)
else:
break
return "".join(chunks)
def get(url, timeout=CONNECTION_TIMEOUT, chunk_size=CHUNK_SIZE):
"""Get the HTTP response from a webserver.
Arguments:
- url (str): A URL (optionally with a scheme attached).
- timeout (int / float) = 0.3: The number of seconds to wait before
timing out. May also be less than 1; a timeout of 0.5 corresponds to
half a second.
- chunk_size (int) = 1024: The size of the chunks to receive, in bytes.
Commonly a multiple of 2 (1024, 2048, 4096). When in doubt, do not
change.
Return:
- (str): The HTTP response.
"""
url = urlparse.urlparse(url)
try:
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
sock.settimeout(timeout)
sock.connect((url.netloc, url.port or 80))
sock.sendall("GET 0 HTTP/1.0 1".format(url.path or "/", CRLF))
data = receive_all(sock)
finally:
sock.shutdown(socket.SHUT_RDWR)
sock.close()
return data
if __name__ == "__main__":
print(get("http://www.google.com/robots.txt", chunk_size=65535))
References
1 PEP-8: String Quotes
2 PEP-8: Blank Lines
Review
Is the code pythonic?
The code looks quite clean to me, but it never gets pythonic enough ;)
Some PEP-8 issues:
Don't mix single and double quotes;1
Leave two blank lines between the constants and the first function / class definition.2
Don't use
repr()
:repr()
is meant to get a printable (human readable)
version of an object. In this case, you don't even need to modify the chunks in any way, because they are already of the correct type (str
).You should consider a longer connection timeout. I find a second to be acceptable, but this of course depends on your use case.
There's usually no reason to set
socket.SO_REUSEADDR
. Regardless, doing so only has an effect if you bind the socket to an address after having done so.socket.socket.send()
does not guarantee that all of what you passed it was sent, instead, it returns the number of bytes sent. You may want to put that part in a loop to assure all data was sent. Or considersocket.socket.sendall()
.If you choose to explicitly call
socket.socket.shutdown()
, make sure you get the argument right.1
corresponds tosocket.SHUT_WR
, but you need to prevent reading and writing. Use module constants!socket.SHUT_RDWR
is the answer.I see you added a docstring for
receive_all()
. That's a nice start, but the docstring is a little generic. A more elaborate description of the arguments and return type and value would be helpful. The same goes forget()
. If this were a library,get()
would be part of the API, so documenting it should be a priority.Make sure the documentation is accurate. 'Gather all the data from a request' makes it sound like the socket already made a request and the function merely modifies the response in some way and then returns it. That may give people the false impression that the function has no side effects, but it does!
Let's say you extend this piece of code, and add more helper functions. If one of them raises an exception, the socket is never closed, and you leave the server hanging. Luckily, Python sockets provide context manager capabilities out of the box. That means we can make sure resources are cleaned up:
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
sock.settimeout(timeout)
...
return dataTurns out this has only been a feature since Python 3.2. One of the many reasons to switch to Python 3. Are you being forced to work with Python 2? A
try
/finally
is probably the closest we can get:try:
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
sock.settimeout(timeout)
...
finally:
sock.shutdown(socket.SHUT_RDWR)
sock.close()
Does the receive_all function make the code any cleaner or should that functionally be apart of the get function?
It's a good idea to extract functionality and put it in its own function, so yes, it does make the code cleaner. Should you be a big fan of OOP, a custom socket class extending from socket.socket
, with a receive_all()
method, would be the solution.
My take on it
import socket
import urlparse
CONNECTION_TIMEOUT = 0.30
CHUNK_SIZE = 1024
CRLF = "rnrn"
socket.setdefaulttimeout(CONNECTION_TIMEOUT)
def receive_all(sock, chunk_size=CHUNK_SIZE):
"""Receive all data from a socket, until EOF is reached.
Arguments:
- sock (socket.socket): The socket to receive from. This socket must
be in a valid state, where it can receive data.
- chunk_size (int) = 1024: The size of the chunks to receive, in
bytes. Commonly a multiple of 2 (1024, 2048, 4096). When in doubt,
do not change.
Return:
- (str) All the data received.
"""
chunks =
while True:
chunk = sock.recv(chunk_size)
if chunk:
chunks.append(chunk)
else:
break
return "".join(chunks)
def get(url, timeout=CONNECTION_TIMEOUT, chunk_size=CHUNK_SIZE):
"""Get the HTTP response from a webserver.
Arguments:
- url (str): A URL (optionally with a scheme attached).
- timeout (int / float) = 0.3: The number of seconds to wait before
timing out. May also be less than 1; a timeout of 0.5 corresponds to
half a second.
- chunk_size (int) = 1024: The size of the chunks to receive, in bytes.
Commonly a multiple of 2 (1024, 2048, 4096). When in doubt, do not
change.
Return:
- (str): The HTTP response.
"""
url = urlparse.urlparse(url)
try:
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
sock.settimeout(timeout)
sock.connect((url.netloc, url.port or 80))
sock.sendall("GET 0 HTTP/1.0 1".format(url.path or "/", CRLF))
data = receive_all(sock)
finally:
sock.shutdown(socket.SHUT_RDWR)
sock.close()
return data
if __name__ == "__main__":
print(get("http://www.google.com/robots.txt", chunk_size=65535))
References
1 PEP-8: String Quotes
2 PEP-8: Blank Lines
edited Jun 9 at 12:57
answered Jun 8 at 18:21
Daniel
4,1132836
4,1132836
add a comment |Â
add a comment |Â
Sign up or log in
StackExchange.ready(function ()
StackExchange.helpers.onClickDraftSave('#login-link');
);
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
StackExchange.ready(
function ()
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f196128%2fconstruct-and-send-an-http-get-request-from-scratch-and-print-all-the-received-d%23new-answer', 'question_page');
);
Post as a guest
Sign up or log in
StackExchange.ready(function ()
StackExchange.helpers.onClickDraftSave('#login-link');
);
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Sign up or log in
StackExchange.ready(function ()
StackExchange.helpers.onClickDraftSave('#login-link');
);
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Sign up or log in
StackExchange.ready(function ()
StackExchange.helpers.onClickDraftSave('#login-link');
);
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
socket.setdefaulttimeout()
is a function. Are you sure you didn't mean to call it withCONNECTION_TIMEOUT
as argument?â Daniel
Jun 8 at 17:56
@Coal_ Thanks for pointing out that bug
â Ricky Wilson
Jun 8 at 18:01
Does this code work correctly?
â Phrancis
Jun 8 at 18:11
@Phrancis Just tested it, works fine for Python 2.
â Daniel
Jun 8 at 18:14