Construct and send an HTTP get request from scratch and print all the received data to the screen

The name of the pictureThe name of the pictureThe name of the pictureClash Royale CLAN TAG#URR8PPP





.everyoneloves__top-leaderboard:empty,.everyoneloves__mid-leaderboard:empty margin-bottom:0;







up vote
3
down vote

favorite












My concerns:

Is the code pythonic?

Does the receive_all function make the code any cleaner or should that functionally be apart of the get function?



import socket
import urlparse

CONNECTION_TIMEOUT = 0.30
CHUNK_SIZE = 1024
socket.setdefaulttimeout = CONNECTION_TIMEOUT
CRLF = "rnrn"

def receive_all(sock, chunk_size=CHUNK_SIZE):
'''
Gather all the data from a request.
'''
chunks =
while True:
chunk = sock.recv(chunk_size)
if chunk:
chunks.append(chunk)
else:
break

return repr(''.join(chunks))


def get(url, timeout=CONNECTION_TIMEOUT, chunk_size=CHUNK_SIZE):
url = urlparse.urlparse(url)
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
sock.settimeout(timeout)
sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
sock.connect((url.netloc, url.port or 80))
sock.send('GET 0 HTTP/1.0 1'.format(url.path or '/', CRLF))
data = receive_all(sock)
sock.shutdown(1)
sock.close()
return data

print(get('http://www.google.com/robots.txt', chunk_size=65535))






share|improve this question





















  • socket.setdefaulttimeout() is a function. Are you sure you didn't mean to call it with CONNECTION_TIMEOUT as argument?
    – Daniel
    Jun 8 at 17:56










  • @Coal_ Thanks for pointing out that bug
    – Ricky Wilson
    Jun 8 at 18:01










  • Does this code work correctly?
    – Phrancis
    Jun 8 at 18:11










  • @Phrancis Just tested it, works fine for Python 2.
    – Daniel
    Jun 8 at 18:14
















up vote
3
down vote

favorite












My concerns:

Is the code pythonic?

Does the receive_all function make the code any cleaner or should that functionally be apart of the get function?



import socket
import urlparse

CONNECTION_TIMEOUT = 0.30
CHUNK_SIZE = 1024
socket.setdefaulttimeout = CONNECTION_TIMEOUT
CRLF = "rnrn"

def receive_all(sock, chunk_size=CHUNK_SIZE):
'''
Gather all the data from a request.
'''
chunks =
while True:
chunk = sock.recv(chunk_size)
if chunk:
chunks.append(chunk)
else:
break

return repr(''.join(chunks))


def get(url, timeout=CONNECTION_TIMEOUT, chunk_size=CHUNK_SIZE):
url = urlparse.urlparse(url)
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
sock.settimeout(timeout)
sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
sock.connect((url.netloc, url.port or 80))
sock.send('GET 0 HTTP/1.0 1'.format(url.path or '/', CRLF))
data = receive_all(sock)
sock.shutdown(1)
sock.close()
return data

print(get('http://www.google.com/robots.txt', chunk_size=65535))






share|improve this question





















  • socket.setdefaulttimeout() is a function. Are you sure you didn't mean to call it with CONNECTION_TIMEOUT as argument?
    – Daniel
    Jun 8 at 17:56










  • @Coal_ Thanks for pointing out that bug
    – Ricky Wilson
    Jun 8 at 18:01










  • Does this code work correctly?
    – Phrancis
    Jun 8 at 18:11










  • @Phrancis Just tested it, works fine for Python 2.
    – Daniel
    Jun 8 at 18:14












up vote
3
down vote

favorite









up vote
3
down vote

favorite











My concerns:

Is the code pythonic?

Does the receive_all function make the code any cleaner or should that functionally be apart of the get function?



import socket
import urlparse

CONNECTION_TIMEOUT = 0.30
CHUNK_SIZE = 1024
socket.setdefaulttimeout = CONNECTION_TIMEOUT
CRLF = "rnrn"

def receive_all(sock, chunk_size=CHUNK_SIZE):
'''
Gather all the data from a request.
'''
chunks =
while True:
chunk = sock.recv(chunk_size)
if chunk:
chunks.append(chunk)
else:
break

return repr(''.join(chunks))


def get(url, timeout=CONNECTION_TIMEOUT, chunk_size=CHUNK_SIZE):
url = urlparse.urlparse(url)
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
sock.settimeout(timeout)
sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
sock.connect((url.netloc, url.port or 80))
sock.send('GET 0 HTTP/1.0 1'.format(url.path or '/', CRLF))
data = receive_all(sock)
sock.shutdown(1)
sock.close()
return data

print(get('http://www.google.com/robots.txt', chunk_size=65535))






share|improve this question













My concerns:

Is the code pythonic?

Does the receive_all function make the code any cleaner or should that functionally be apart of the get function?



import socket
import urlparse

CONNECTION_TIMEOUT = 0.30
CHUNK_SIZE = 1024
socket.setdefaulttimeout = CONNECTION_TIMEOUT
CRLF = "rnrn"

def receive_all(sock, chunk_size=CHUNK_SIZE):
'''
Gather all the data from a request.
'''
chunks =
while True:
chunk = sock.recv(chunk_size)
if chunk:
chunks.append(chunk)
else:
break

return repr(''.join(chunks))


def get(url, timeout=CONNECTION_TIMEOUT, chunk_size=CHUNK_SIZE):
url = urlparse.urlparse(url)
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
sock.settimeout(timeout)
sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
sock.connect((url.netloc, url.port or 80))
sock.send('GET 0 HTTP/1.0 1'.format(url.path or '/', CRLF))
data = receive_all(sock)
sock.shutdown(1)
sock.close()
return data

print(get('http://www.google.com/robots.txt', chunk_size=65535))








share|improve this question












share|improve this question




share|improve this question








edited Jun 9 at 7:44









Mathias Ettinger

21.8k32875




21.8k32875









asked Jun 8 at 17:51









Ricky Wilson

820915




820915











  • socket.setdefaulttimeout() is a function. Are you sure you didn't mean to call it with CONNECTION_TIMEOUT as argument?
    – Daniel
    Jun 8 at 17:56










  • @Coal_ Thanks for pointing out that bug
    – Ricky Wilson
    Jun 8 at 18:01










  • Does this code work correctly?
    – Phrancis
    Jun 8 at 18:11










  • @Phrancis Just tested it, works fine for Python 2.
    – Daniel
    Jun 8 at 18:14
















  • socket.setdefaulttimeout() is a function. Are you sure you didn't mean to call it with CONNECTION_TIMEOUT as argument?
    – Daniel
    Jun 8 at 17:56










  • @Coal_ Thanks for pointing out that bug
    – Ricky Wilson
    Jun 8 at 18:01










  • Does this code work correctly?
    – Phrancis
    Jun 8 at 18:11










  • @Phrancis Just tested it, works fine for Python 2.
    – Daniel
    Jun 8 at 18:14















socket.setdefaulttimeout() is a function. Are you sure you didn't mean to call it with CONNECTION_TIMEOUT as argument?
– Daniel
Jun 8 at 17:56




socket.setdefaulttimeout() is a function. Are you sure you didn't mean to call it with CONNECTION_TIMEOUT as argument?
– Daniel
Jun 8 at 17:56












@Coal_ Thanks for pointing out that bug
– Ricky Wilson
Jun 8 at 18:01




@Coal_ Thanks for pointing out that bug
– Ricky Wilson
Jun 8 at 18:01












Does this code work correctly?
– Phrancis
Jun 8 at 18:11




Does this code work correctly?
– Phrancis
Jun 8 at 18:11












@Phrancis Just tested it, works fine for Python 2.
– Daniel
Jun 8 at 18:14




@Phrancis Just tested it, works fine for Python 2.
– Daniel
Jun 8 at 18:14










1 Answer
1






active

oldest

votes

















up vote
3
down vote













Review




Is the code pythonic?




The code looks quite clean to me, but it never gets pythonic enough ;)




  1. Some PEP-8 issues:



    • Don't mix single and double quotes;1


    • Leave two blank lines between the constants and the first function / class definition.2



  2. Don't use repr(): repr() is meant to get a printable (human readable)
    version of an object. In this case, you don't even need to modify the chunks in any way, because they are already of the correct type (str).


  3. You should consider a longer connection timeout. I find a second to be acceptable, but this of course depends on your use case.


  4. There's usually no reason to set socket.SO_REUSEADDR. Regardless, doing so only has an effect if you bind the socket to an address after having done so.


  5. socket.socket.send() does not guarantee that all of what you passed it was sent, instead, it returns the number of bytes sent. You may want to put that part in a loop to assure all data was sent. Or consider socket.socket.sendall().


  6. If you choose to explicitly call socket.socket.shutdown(), make sure you get the argument right. 1 corresponds to socket.SHUT_WR, but you need to prevent reading and writing. Use module constants! socket.SHUT_RDWR is the answer.



  7. I see you added a docstring for receive_all(). That's a nice start, but the docstring is a little generic. A more elaborate description of the arguments and return type and value would be helpful. The same goes for get(). If this were a library, get() would be part of the API, so documenting it should be a priority.



    Make sure the documentation is accurate. 'Gather all the data from a request' makes it sound like the socket already made a request and the function merely modifies the response in some way and then returns it. That may give people the false impression that the function has no side effects, but it does!




  8. Let's say you extend this piece of code, and add more helper functions. If one of them raises an exception, the socket is never closed, and you leave the server hanging. Luckily, Python sockets provide context manager capabilities out of the box. That means we can make sure resources are cleaned up:



    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
    sock.settimeout(timeout)
    ...
    return data


    Turns out this has only been a feature since Python 3.2. One of the many reasons to switch to Python 3. Are you being forced to work with Python 2? A try / finally is probably the closest we can get:



    try:
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock.settimeout(timeout)
    ...
    finally:
    sock.shutdown(socket.SHUT_RDWR)
    sock.close()



Does the receive_all function make the code any cleaner or should that functionally be apart of the get function?




It's a good idea to extract functionality and put it in its own function, so yes, it does make the code cleaner. Should you be a big fan of OOP, a custom socket class extending from socket.socket, with a receive_all() method, would be the solution.



My take on it



import socket
import urlparse

CONNECTION_TIMEOUT = 0.30
CHUNK_SIZE = 1024
CRLF = "rnrn"

socket.setdefaulttimeout(CONNECTION_TIMEOUT)


def receive_all(sock, chunk_size=CHUNK_SIZE):
"""Receive all data from a socket, until EOF is reached.

Arguments:
- sock (socket.socket): The socket to receive from. This socket must
be in a valid state, where it can receive data.
- chunk_size (int) = 1024: The size of the chunks to receive, in
bytes. Commonly a multiple of 2 (1024, 2048, 4096). When in doubt,
do not change.

Return:
- (str) All the data received.
"""
chunks =
while True:
chunk = sock.recv(chunk_size)
if chunk:
chunks.append(chunk)
else:
break
return "".join(chunks)


def get(url, timeout=CONNECTION_TIMEOUT, chunk_size=CHUNK_SIZE):
"""Get the HTTP response from a webserver.

Arguments:
- url (str): A URL (optionally with a scheme attached).
- timeout (int / float) = 0.3: The number of seconds to wait before
timing out. May also be less than 1; a timeout of 0.5 corresponds to
half a second.
- chunk_size (int) = 1024: The size of the chunks to receive, in bytes.
Commonly a multiple of 2 (1024, 2048, 4096). When in doubt, do not
change.

Return:
- (str): The HTTP response.
"""
url = urlparse.urlparse(url)
try:
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
sock.settimeout(timeout)
sock.connect((url.netloc, url.port or 80))
sock.sendall("GET 0 HTTP/1.0 1".format(url.path or "/", CRLF))
data = receive_all(sock)
finally:
sock.shutdown(socket.SHUT_RDWR)
sock.close()
return data

if __name__ == "__main__":
print(get("http://www.google.com/robots.txt", chunk_size=65535))


References



1 PEP-8: String Quotes



2 PEP-8: Blank Lines






share|improve this answer























    Your Answer




    StackExchange.ifUsing("editor", function ()
    return StackExchange.using("mathjaxEditing", function ()
    StackExchange.MarkdownEditor.creationCallbacks.add(function (editor, postfix)
    StackExchange.mathjaxEditing.prepareWmdForMathJax(editor, postfix, [["\$", "\$"]]);
    );
    );
    , "mathjax-editing");

    StackExchange.ifUsing("editor", function ()
    StackExchange.using("externalEditor", function ()
    StackExchange.using("snippets", function ()
    StackExchange.snippets.init();
    );
    );
    , "code-snippets");

    StackExchange.ready(function()
    var channelOptions =
    tags: "".split(" "),
    id: "196"
    ;
    initTagRenderer("".split(" "), "".split(" "), channelOptions);

    StackExchange.using("externalEditor", function()
    // Have to fire editor after snippets, if snippets enabled
    if (StackExchange.settings.snippets.snippetsEnabled)
    StackExchange.using("snippets", function()
    createEditor();
    );

    else
    createEditor();

    );

    function createEditor()
    StackExchange.prepareEditor(
    heartbeatType: 'answer',
    convertImagesToLinks: false,
    noModals: false,
    showLowRepImageUploadWarning: true,
    reputationToPostImages: null,
    bindNavPrevention: true,
    postfix: "",
    onDemand: true,
    discardSelector: ".discard-answer"
    ,immediatelyShowMarkdownHelp:true
    );



    );








     

    draft saved


    draft discarded


















    StackExchange.ready(
    function ()
    StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f196128%2fconstruct-and-send-an-http-get-request-from-scratch-and-print-all-the-received-d%23new-answer', 'question_page');

    );

    Post as a guest






























    1 Answer
    1






    active

    oldest

    votes








    1 Answer
    1






    active

    oldest

    votes









    active

    oldest

    votes






    active

    oldest

    votes








    up vote
    3
    down vote













    Review




    Is the code pythonic?




    The code looks quite clean to me, but it never gets pythonic enough ;)




    1. Some PEP-8 issues:



      • Don't mix single and double quotes;1


      • Leave two blank lines between the constants and the first function / class definition.2



    2. Don't use repr(): repr() is meant to get a printable (human readable)
      version of an object. In this case, you don't even need to modify the chunks in any way, because they are already of the correct type (str).


    3. You should consider a longer connection timeout. I find a second to be acceptable, but this of course depends on your use case.


    4. There's usually no reason to set socket.SO_REUSEADDR. Regardless, doing so only has an effect if you bind the socket to an address after having done so.


    5. socket.socket.send() does not guarantee that all of what you passed it was sent, instead, it returns the number of bytes sent. You may want to put that part in a loop to assure all data was sent. Or consider socket.socket.sendall().


    6. If you choose to explicitly call socket.socket.shutdown(), make sure you get the argument right. 1 corresponds to socket.SHUT_WR, but you need to prevent reading and writing. Use module constants! socket.SHUT_RDWR is the answer.



    7. I see you added a docstring for receive_all(). That's a nice start, but the docstring is a little generic. A more elaborate description of the arguments and return type and value would be helpful. The same goes for get(). If this were a library, get() would be part of the API, so documenting it should be a priority.



      Make sure the documentation is accurate. 'Gather all the data from a request' makes it sound like the socket already made a request and the function merely modifies the response in some way and then returns it. That may give people the false impression that the function has no side effects, but it does!




    8. Let's say you extend this piece of code, and add more helper functions. If one of them raises an exception, the socket is never closed, and you leave the server hanging. Luckily, Python sockets provide context manager capabilities out of the box. That means we can make sure resources are cleaned up:



      with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
      sock.settimeout(timeout)
      ...
      return data


      Turns out this has only been a feature since Python 3.2. One of the many reasons to switch to Python 3. Are you being forced to work with Python 2? A try / finally is probably the closest we can get:



      try:
      sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
      sock.settimeout(timeout)
      ...
      finally:
      sock.shutdown(socket.SHUT_RDWR)
      sock.close()



    Does the receive_all function make the code any cleaner or should that functionally be apart of the get function?




    It's a good idea to extract functionality and put it in its own function, so yes, it does make the code cleaner. Should you be a big fan of OOP, a custom socket class extending from socket.socket, with a receive_all() method, would be the solution.



    My take on it



    import socket
    import urlparse

    CONNECTION_TIMEOUT = 0.30
    CHUNK_SIZE = 1024
    CRLF = "rnrn"

    socket.setdefaulttimeout(CONNECTION_TIMEOUT)


    def receive_all(sock, chunk_size=CHUNK_SIZE):
    """Receive all data from a socket, until EOF is reached.

    Arguments:
    - sock (socket.socket): The socket to receive from. This socket must
    be in a valid state, where it can receive data.
    - chunk_size (int) = 1024: The size of the chunks to receive, in
    bytes. Commonly a multiple of 2 (1024, 2048, 4096). When in doubt,
    do not change.

    Return:
    - (str) All the data received.
    """
    chunks =
    while True:
    chunk = sock.recv(chunk_size)
    if chunk:
    chunks.append(chunk)
    else:
    break
    return "".join(chunks)


    def get(url, timeout=CONNECTION_TIMEOUT, chunk_size=CHUNK_SIZE):
    """Get the HTTP response from a webserver.

    Arguments:
    - url (str): A URL (optionally with a scheme attached).
    - timeout (int / float) = 0.3: The number of seconds to wait before
    timing out. May also be less than 1; a timeout of 0.5 corresponds to
    half a second.
    - chunk_size (int) = 1024: The size of the chunks to receive, in bytes.
    Commonly a multiple of 2 (1024, 2048, 4096). When in doubt, do not
    change.

    Return:
    - (str): The HTTP response.
    """
    url = urlparse.urlparse(url)
    try:
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock.settimeout(timeout)
    sock.connect((url.netloc, url.port or 80))
    sock.sendall("GET 0 HTTP/1.0 1".format(url.path or "/", CRLF))
    data = receive_all(sock)
    finally:
    sock.shutdown(socket.SHUT_RDWR)
    sock.close()
    return data

    if __name__ == "__main__":
    print(get("http://www.google.com/robots.txt", chunk_size=65535))


    References



    1 PEP-8: String Quotes



    2 PEP-8: Blank Lines






    share|improve this answer



























      up vote
      3
      down vote













      Review




      Is the code pythonic?




      The code looks quite clean to me, but it never gets pythonic enough ;)




      1. Some PEP-8 issues:



        • Don't mix single and double quotes;1


        • Leave two blank lines between the constants and the first function / class definition.2



      2. Don't use repr(): repr() is meant to get a printable (human readable)
        version of an object. In this case, you don't even need to modify the chunks in any way, because they are already of the correct type (str).


      3. You should consider a longer connection timeout. I find a second to be acceptable, but this of course depends on your use case.


      4. There's usually no reason to set socket.SO_REUSEADDR. Regardless, doing so only has an effect if you bind the socket to an address after having done so.


      5. socket.socket.send() does not guarantee that all of what you passed it was sent, instead, it returns the number of bytes sent. You may want to put that part in a loop to assure all data was sent. Or consider socket.socket.sendall().


      6. If you choose to explicitly call socket.socket.shutdown(), make sure you get the argument right. 1 corresponds to socket.SHUT_WR, but you need to prevent reading and writing. Use module constants! socket.SHUT_RDWR is the answer.



      7. I see you added a docstring for receive_all(). That's a nice start, but the docstring is a little generic. A more elaborate description of the arguments and return type and value would be helpful. The same goes for get(). If this were a library, get() would be part of the API, so documenting it should be a priority.



        Make sure the documentation is accurate. 'Gather all the data from a request' makes it sound like the socket already made a request and the function merely modifies the response in some way and then returns it. That may give people the false impression that the function has no side effects, but it does!




      8. Let's say you extend this piece of code, and add more helper functions. If one of them raises an exception, the socket is never closed, and you leave the server hanging. Luckily, Python sockets provide context manager capabilities out of the box. That means we can make sure resources are cleaned up:



        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
        sock.settimeout(timeout)
        ...
        return data


        Turns out this has only been a feature since Python 3.2. One of the many reasons to switch to Python 3. Are you being forced to work with Python 2? A try / finally is probably the closest we can get:



        try:
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        sock.settimeout(timeout)
        ...
        finally:
        sock.shutdown(socket.SHUT_RDWR)
        sock.close()



      Does the receive_all function make the code any cleaner or should that functionally be apart of the get function?




      It's a good idea to extract functionality and put it in its own function, so yes, it does make the code cleaner. Should you be a big fan of OOP, a custom socket class extending from socket.socket, with a receive_all() method, would be the solution.



      My take on it



      import socket
      import urlparse

      CONNECTION_TIMEOUT = 0.30
      CHUNK_SIZE = 1024
      CRLF = "rnrn"

      socket.setdefaulttimeout(CONNECTION_TIMEOUT)


      def receive_all(sock, chunk_size=CHUNK_SIZE):
      """Receive all data from a socket, until EOF is reached.

      Arguments:
      - sock (socket.socket): The socket to receive from. This socket must
      be in a valid state, where it can receive data.
      - chunk_size (int) = 1024: The size of the chunks to receive, in
      bytes. Commonly a multiple of 2 (1024, 2048, 4096). When in doubt,
      do not change.

      Return:
      - (str) All the data received.
      """
      chunks =
      while True:
      chunk = sock.recv(chunk_size)
      if chunk:
      chunks.append(chunk)
      else:
      break
      return "".join(chunks)


      def get(url, timeout=CONNECTION_TIMEOUT, chunk_size=CHUNK_SIZE):
      """Get the HTTP response from a webserver.

      Arguments:
      - url (str): A URL (optionally with a scheme attached).
      - timeout (int / float) = 0.3: The number of seconds to wait before
      timing out. May also be less than 1; a timeout of 0.5 corresponds to
      half a second.
      - chunk_size (int) = 1024: The size of the chunks to receive, in bytes.
      Commonly a multiple of 2 (1024, 2048, 4096). When in doubt, do not
      change.

      Return:
      - (str): The HTTP response.
      """
      url = urlparse.urlparse(url)
      try:
      sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
      sock.settimeout(timeout)
      sock.connect((url.netloc, url.port or 80))
      sock.sendall("GET 0 HTTP/1.0 1".format(url.path or "/", CRLF))
      data = receive_all(sock)
      finally:
      sock.shutdown(socket.SHUT_RDWR)
      sock.close()
      return data

      if __name__ == "__main__":
      print(get("http://www.google.com/robots.txt", chunk_size=65535))


      References



      1 PEP-8: String Quotes



      2 PEP-8: Blank Lines






      share|improve this answer

























        up vote
        3
        down vote










        up vote
        3
        down vote









        Review




        Is the code pythonic?




        The code looks quite clean to me, but it never gets pythonic enough ;)




        1. Some PEP-8 issues:



          • Don't mix single and double quotes;1


          • Leave two blank lines between the constants and the first function / class definition.2



        2. Don't use repr(): repr() is meant to get a printable (human readable)
          version of an object. In this case, you don't even need to modify the chunks in any way, because they are already of the correct type (str).


        3. You should consider a longer connection timeout. I find a second to be acceptable, but this of course depends on your use case.


        4. There's usually no reason to set socket.SO_REUSEADDR. Regardless, doing so only has an effect if you bind the socket to an address after having done so.


        5. socket.socket.send() does not guarantee that all of what you passed it was sent, instead, it returns the number of bytes sent. You may want to put that part in a loop to assure all data was sent. Or consider socket.socket.sendall().


        6. If you choose to explicitly call socket.socket.shutdown(), make sure you get the argument right. 1 corresponds to socket.SHUT_WR, but you need to prevent reading and writing. Use module constants! socket.SHUT_RDWR is the answer.



        7. I see you added a docstring for receive_all(). That's a nice start, but the docstring is a little generic. A more elaborate description of the arguments and return type and value would be helpful. The same goes for get(). If this were a library, get() would be part of the API, so documenting it should be a priority.



          Make sure the documentation is accurate. 'Gather all the data from a request' makes it sound like the socket already made a request and the function merely modifies the response in some way and then returns it. That may give people the false impression that the function has no side effects, but it does!




        8. Let's say you extend this piece of code, and add more helper functions. If one of them raises an exception, the socket is never closed, and you leave the server hanging. Luckily, Python sockets provide context manager capabilities out of the box. That means we can make sure resources are cleaned up:



          with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
          sock.settimeout(timeout)
          ...
          return data


          Turns out this has only been a feature since Python 3.2. One of the many reasons to switch to Python 3. Are you being forced to work with Python 2? A try / finally is probably the closest we can get:



          try:
          sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
          sock.settimeout(timeout)
          ...
          finally:
          sock.shutdown(socket.SHUT_RDWR)
          sock.close()



        Does the receive_all function make the code any cleaner or should that functionally be apart of the get function?




        It's a good idea to extract functionality and put it in its own function, so yes, it does make the code cleaner. Should you be a big fan of OOP, a custom socket class extending from socket.socket, with a receive_all() method, would be the solution.



        My take on it



        import socket
        import urlparse

        CONNECTION_TIMEOUT = 0.30
        CHUNK_SIZE = 1024
        CRLF = "rnrn"

        socket.setdefaulttimeout(CONNECTION_TIMEOUT)


        def receive_all(sock, chunk_size=CHUNK_SIZE):
        """Receive all data from a socket, until EOF is reached.

        Arguments:
        - sock (socket.socket): The socket to receive from. This socket must
        be in a valid state, where it can receive data.
        - chunk_size (int) = 1024: The size of the chunks to receive, in
        bytes. Commonly a multiple of 2 (1024, 2048, 4096). When in doubt,
        do not change.

        Return:
        - (str) All the data received.
        """
        chunks =
        while True:
        chunk = sock.recv(chunk_size)
        if chunk:
        chunks.append(chunk)
        else:
        break
        return "".join(chunks)


        def get(url, timeout=CONNECTION_TIMEOUT, chunk_size=CHUNK_SIZE):
        """Get the HTTP response from a webserver.

        Arguments:
        - url (str): A URL (optionally with a scheme attached).
        - timeout (int / float) = 0.3: The number of seconds to wait before
        timing out. May also be less than 1; a timeout of 0.5 corresponds to
        half a second.
        - chunk_size (int) = 1024: The size of the chunks to receive, in bytes.
        Commonly a multiple of 2 (1024, 2048, 4096). When in doubt, do not
        change.

        Return:
        - (str): The HTTP response.
        """
        url = urlparse.urlparse(url)
        try:
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        sock.settimeout(timeout)
        sock.connect((url.netloc, url.port or 80))
        sock.sendall("GET 0 HTTP/1.0 1".format(url.path or "/", CRLF))
        data = receive_all(sock)
        finally:
        sock.shutdown(socket.SHUT_RDWR)
        sock.close()
        return data

        if __name__ == "__main__":
        print(get("http://www.google.com/robots.txt", chunk_size=65535))


        References



        1 PEP-8: String Quotes



        2 PEP-8: Blank Lines






        share|improve this answer















        Review




        Is the code pythonic?




        The code looks quite clean to me, but it never gets pythonic enough ;)




        1. Some PEP-8 issues:



          • Don't mix single and double quotes;1


          • Leave two blank lines between the constants and the first function / class definition.2



        2. Don't use repr(): repr() is meant to get a printable (human readable)
          version of an object. In this case, you don't even need to modify the chunks in any way, because they are already of the correct type (str).


        3. You should consider a longer connection timeout. I find a second to be acceptable, but this of course depends on your use case.


        4. There's usually no reason to set socket.SO_REUSEADDR. Regardless, doing so only has an effect if you bind the socket to an address after having done so.


        5. socket.socket.send() does not guarantee that all of what you passed it was sent, instead, it returns the number of bytes sent. You may want to put that part in a loop to assure all data was sent. Or consider socket.socket.sendall().


        6. If you choose to explicitly call socket.socket.shutdown(), make sure you get the argument right. 1 corresponds to socket.SHUT_WR, but you need to prevent reading and writing. Use module constants! socket.SHUT_RDWR is the answer.



        7. I see you added a docstring for receive_all(). That's a nice start, but the docstring is a little generic. A more elaborate description of the arguments and return type and value would be helpful. The same goes for get(). If this were a library, get() would be part of the API, so documenting it should be a priority.



          Make sure the documentation is accurate. 'Gather all the data from a request' makes it sound like the socket already made a request and the function merely modifies the response in some way and then returns it. That may give people the false impression that the function has no side effects, but it does!




        8. Let's say you extend this piece of code, and add more helper functions. If one of them raises an exception, the socket is never closed, and you leave the server hanging. Luckily, Python sockets provide context manager capabilities out of the box. That means we can make sure resources are cleaned up:



          with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
          sock.settimeout(timeout)
          ...
          return data


          Turns out this has only been a feature since Python 3.2. One of the many reasons to switch to Python 3. Are you being forced to work with Python 2? A try / finally is probably the closest we can get:



          try:
          sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
          sock.settimeout(timeout)
          ...
          finally:
          sock.shutdown(socket.SHUT_RDWR)
          sock.close()



        Does the receive_all function make the code any cleaner or should that functionally be apart of the get function?




        It's a good idea to extract functionality and put it in its own function, so yes, it does make the code cleaner. Should you be a big fan of OOP, a custom socket class extending from socket.socket, with a receive_all() method, would be the solution.



        My take on it



        import socket
        import urlparse

        CONNECTION_TIMEOUT = 0.30
        CHUNK_SIZE = 1024
        CRLF = "rnrn"

        socket.setdefaulttimeout(CONNECTION_TIMEOUT)


        def receive_all(sock, chunk_size=CHUNK_SIZE):
        """Receive all data from a socket, until EOF is reached.

        Arguments:
        - sock (socket.socket): The socket to receive from. This socket must
        be in a valid state, where it can receive data.
        - chunk_size (int) = 1024: The size of the chunks to receive, in
        bytes. Commonly a multiple of 2 (1024, 2048, 4096). When in doubt,
        do not change.

        Return:
        - (str) All the data received.
        """
        chunks =
        while True:
        chunk = sock.recv(chunk_size)
        if chunk:
        chunks.append(chunk)
        else:
        break
        return "".join(chunks)


        def get(url, timeout=CONNECTION_TIMEOUT, chunk_size=CHUNK_SIZE):
        """Get the HTTP response from a webserver.

        Arguments:
        - url (str): A URL (optionally with a scheme attached).
        - timeout (int / float) = 0.3: The number of seconds to wait before
        timing out. May also be less than 1; a timeout of 0.5 corresponds to
        half a second.
        - chunk_size (int) = 1024: The size of the chunks to receive, in bytes.
        Commonly a multiple of 2 (1024, 2048, 4096). When in doubt, do not
        change.

        Return:
        - (str): The HTTP response.
        """
        url = urlparse.urlparse(url)
        try:
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        sock.settimeout(timeout)
        sock.connect((url.netloc, url.port or 80))
        sock.sendall("GET 0 HTTP/1.0 1".format(url.path or "/", CRLF))
        data = receive_all(sock)
        finally:
        sock.shutdown(socket.SHUT_RDWR)
        sock.close()
        return data

        if __name__ == "__main__":
        print(get("http://www.google.com/robots.txt", chunk_size=65535))


        References



        1 PEP-8: String Quotes



        2 PEP-8: Blank Lines







        share|improve this answer















        share|improve this answer



        share|improve this answer








        edited Jun 9 at 12:57


























        answered Jun 8 at 18:21









        Daniel

        4,1132836




        4,1132836






















             

            draft saved


            draft discarded


























             


            draft saved


            draft discarded














            StackExchange.ready(
            function ()
            StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f196128%2fconstruct-and-send-an-http-get-request-from-scratch-and-print-all-the-received-d%23new-answer', 'question_page');

            );

            Post as a guest













































































            Popular posts from this blog

            Greedy Best First Search implementation in Rust

            Function to Return a JSON Like Objects Using VBA Collections and Arrays

            C++11 CLH Lock Implementation