Loop through a list of URLs with multi-threading and check the return code in Python

The name of the pictureThe name of the pictureThe name of the pictureClash Royale CLAN TAG#URR8PPP





.everyoneloves__top-leaderboard:empty,.everyoneloves__mid-leaderboard:empty margin-bottom:0;







up vote
0
down vote

favorite
1












I have to loop through a list of over 4000 URLs and check their HTTP return codes in Python.



Url.txt: Contains a list of 4000 urls with one url per line.



The script takes a long time to run, and I wanted to incorporate multi-threading to improve its speed, but I am not sure if I have done it properly.



It sure doesn't seem like it is working fast enough.



#! /usr/bin/python

# To just check a site and get the URL code
#import urllib.request
#print(urllib.request.urlopen("http://www.stackoverflow.com").getcode())
#############################################################################

import time
import requests

# Wall-clock reference point used for the elapsed-time report at the end of the script.
start = time.time()

from multiprocessing.dummy import Pool
# multiprocessing.dummy exposes the Pool API on top of threads rather than processes.
pool = Pool(8) # Number of concurrent threads

#input file
# NOTE(review): this handle is never read or closed in the visible code; main()
# opens 'url.txt' again on its own. Kept so the module-level name still exists.
URLS = open("url.txt","r")

#output file
# Written to by printStatus(); the name shadows nothing in Python 3.
file = open('output.csv', 'w')

#############################################################################

# ANSI colour escape sequences. Each one must start with the ESC character
# (\033); without it the terminal prints the digits literally instead of
# switching colour.
GREEN = '\033[92m'
YELLOW = '\033[93m'
RED = '\033[91m'
ENDC = '\033[0m'


def main():
    """Check every URL listed in 'url.txt' on the module-level thread pool.

    Reads the file (one URL per line), maps checkUrls over the lines, then
    shuts the pool down and closes the output file so all rows are flushed.
    """
    with open('url.txt') as f:
        url = f.read().splitlines()
    print("\nTesting URLs.", time.ctime())

    # pool.map blocks until every URL has been handled; checkUrls reports each
    # result itself, so the list it returns is not kept.
    pool.map(checkUrls, url)
    print("closing p")
    pool.close()
    pool.join()

    # All workers are done: flush and release the output file. The former
    # time.sleep(100000) only delayed interpreter exit and has been removed.
    file.close()

def checkUrls(url):
    """Check one URL and report the outcome through printStatus.

    The reported status is the string returned by checkUrl when the request
    completes, otherwise a short label naming the requests exception raised.
    Returns None.
    """
    # NOTE(review): every call passes 0, so printStatus numbers each row 1.
    count = 0
    status = "N/A"
    try:
        status = checkUrl(url)
    # Subclasses are listed before their base classes; a base-class handler
    # placed first would match and the specific label could never be produced.
    except requests.exceptions.ProxyError:
        status = "ProxyError"
    except requests.exceptions.ConnectTimeout:
        status = "connectTimeout"
    except requests.exceptions.ConnectionError:
        status = "DOWN"
    except requests.exceptions.ReadTimeout:
        status = "ReadTimeout"
    except requests.exceptions.Timeout:
        status = "TimeoutError"
    except requests.exceptions.HTTPError:
        status = "HttpError"
    except requests.exceptions.TooManyRedirects:
        status = "TooManyRedirects"
    except requests.exceptions.MissingSchema:
        status = "MissingSchema"
    except requests.exceptions.InvalidSchema:
        status = "InvalidSchema"
    except requests.exceptions.InvalidProxyURL:
        status = "InvalidProxy"
    except requests.exceptions.InvalidURL:
        status = "InvalidURL"
    except requests.exceptions.InvalidHeader:
        status = "InvalidHeader"
    except requests.exceptions.URLRequired:
        status = "URLmissing"
    except requests.exceptions.RetryError:
        status = "RetryError"
    except requests.exceptions.RequestException as exc:
        # Any other requests failure: report its class name instead of letting
        # one bad URL abort the whole pool.map run.
        status = type(exc).__name__

    printStatus(url, status, count)


def checkUrl(url):
    """Issue a GET request for url (timeout=5) and return its status code as a string."""
    response = requests.get(url, timeout=5)
    code = response.status_code
    return str(code)


def printStatus(url, status, count):
    """Print one colour-coded result row and append the same row to the output file.

    Args:
        url: The URL that was checked.
        status: Status text; only the exact string "200" is shown in green,
            anything else in red.
        count: Zero-based row index; the row is numbered count + 1.
    """
    color = GREEN if status == "200" else RED

    # Number and status are tab-separated; the file row ends with a newline so
    # each result lands on its own line.
    row = str(count + 1) + '\t' + color + status + ENDC + ' ' + url
    print(row)
    file.write(row + '\n')

#print('Time elapsed (hh:mm:ss.ms) '.format(time_elapsed))

# Main app
#
if __name__ == '__main__':
    main()
    # Take the end timestamp only after main() has returned, so the printed
    # figure covers the URL checks instead of just the module set-up.
    end = time.time()
    print(end - start)






share|improve this question

























    up vote
    0
    down vote

    favorite
    1












    I have to loop through a list of over 4000 urls and check their http return code in python.



    Url.txt: Contains a list of 4000 urls with one url per line.



    The script takes a long time to run and I wanted to incorporate multi-threading to improve speed but not sure if I have done it properly.



    It sure doesn't seem like it is working fast enough.



    #! /usr/bin/python

    # To just check a site and get the URL code
    #import urllib.request
    #print(urllib.request.urlopen("http://www.stackoverflow.com").getcode())
    #############################################################################

    import time
    import requests

    start = time.time()

    from multiprocessing.dummy import Pool
    pool = Pool(8) # Number of concurrent threads

    #input file
    URLS = open("url.txt","r")

    #output file
    file = open('output.csv', 'w')

    #############################################################################

    GREEN = '33[92m'
    YELLOW = '33[93m'
    RED = '33[91m'
    ENDC = '33[0m'


    def main():
    with open('url.txt') as f:

    url = f.read().splitlines()
    print( "nTesting URLs.", time.ctime())

    all_text = pool.map(checkUrls,url)
    print("closing p")
    pool.close()
    pool.join()
    #checkUrls()
    print("Press CTRL+C to exit")
    #I don't need this sleep any longer. Can I remove the next line?
    time.sleep(100000) #Sleep 10 seconds

    def checkUrls(url):
    count = 0
    status = "N/A"
    try:
    status = checkUrl(url)
    except requests.exceptions.ConnectionError:
    status = "DOWN"
    except requests.exceptions.HTTPError:
    status = "HttpError"
    except requests.exceptions.ProxyError:
    status = "ProxyError"
    except requests.exceptions.Timeout:
    status = "TimeoutError"
    except requests.exceptions.ConnectTimeout:
    status = "connectTimeout"
    except requests.exceptions.ReadTimeout:
    status = "ReadTimeout"
    except requests.exceptions.TooManyRedirects:
    status = "TooManyRedirects"
    except requests.exceptions.MissingSchema:
    status = "MissingSchema"
    except requests.exceptions.InvalidURL:
    status = "InvalidURL"
    except requests.exceptions.InvalidHeader:
    status = "InvalidHeader"
    except requests.exceptions.URLRequired:
    status = "URLmissing"
    except requests.exceptions.InvalidProxyURL:
    status = "InvalidProxy"
    except requests.exceptions.RetryError:
    status = "RetryError"
    except requests.exceptions.InvalidSchema:
    status = "InvalidSchema"

    printStatus(url, status, count)

    count+=1
    time_elapsed = datetime.now() - start_time


    def checkUrl(url):
    r = requests.get(url, timeout=5)
    #print r.status_code
    return str(r.status_code)


    def printStatus(url, status, count):
    color = GREEN

    count= count+1
    if status != "200":
    color=RED

    #print(color+status+ENDC+' '+ url)
    print(str(count)+'t' + color+status+ENDC+' '+ url)
    file.write(str(count)+'t' + color+status+ENDC+' '+ url +'n')

    #print('Time elapsed (hh:mm:ss.ms) '.format(time_elapsed))

    end = time.time()
    print(end - start)

    # Main app
    #
    if __name__ == '__main__':
    main()






    share|improve this question





















      up vote
      0
      down vote

      favorite
      1









      up vote
      0
      down vote

      favorite
      1






      1





      I have to loop through a list of over 4000 urls and check their http return code in python.



      Url.txt: Contains a list of 4000 urls with one url per line.



      The script takes a long time to run and I wanted to incorporate multi-threading to improve speed but not sure if I have done it properly.



      It sure doesn't seem like it is working fast enough.



      #! /usr/bin/python

      # To just check a site and get the URL code
      #import urllib.request
      #print(urllib.request.urlopen("http://www.stackoverflow.com").getcode())
      #############################################################################

      import time
      import requests

      start = time.time()

      from multiprocessing.dummy import Pool
      pool = Pool(8) # Number of concurrent threads

      #input file
      URLS = open("url.txt","r")

      #output file
      file = open('output.csv', 'w')

      #############################################################################

      GREEN = '33[92m'
      YELLOW = '33[93m'
      RED = '33[91m'
      ENDC = '33[0m'


      def main():
      with open('url.txt') as f:

      url = f.read().splitlines()
      print( "nTesting URLs.", time.ctime())

      all_text = pool.map(checkUrls,url)
      print("closing p")
      pool.close()
      pool.join()
      #checkUrls()
      print("Press CTRL+C to exit")
      #I don't need this sleep any longer. Can I remove the next line?
      time.sleep(100000) #Sleep 10 seconds

      def checkUrls(url):
      count = 0
      status = "N/A"
      try:
      status = checkUrl(url)
      except requests.exceptions.ConnectionError:
      status = "DOWN"
      except requests.exceptions.HTTPError:
      status = "HttpError"
      except requests.exceptions.ProxyError:
      status = "ProxyError"
      except requests.exceptions.Timeout:
      status = "TimeoutError"
      except requests.exceptions.ConnectTimeout:
      status = "connectTimeout"
      except requests.exceptions.ReadTimeout:
      status = "ReadTimeout"
      except requests.exceptions.TooManyRedirects:
      status = "TooManyRedirects"
      except requests.exceptions.MissingSchema:
      status = "MissingSchema"
      except requests.exceptions.InvalidURL:
      status = "InvalidURL"
      except requests.exceptions.InvalidHeader:
      status = "InvalidHeader"
      except requests.exceptions.URLRequired:
      status = "URLmissing"
      except requests.exceptions.InvalidProxyURL:
      status = "InvalidProxy"
      except requests.exceptions.RetryError:
      status = "RetryError"
      except requests.exceptions.InvalidSchema:
      status = "InvalidSchema"

      printStatus(url, status, count)

      count+=1
      time_elapsed = datetime.now() - start_time


      def checkUrl(url):
      r = requests.get(url, timeout=5)
      #print r.status_code
      return str(r.status_code)


      def printStatus(url, status, count):
      color = GREEN

      count= count+1
      if status != "200":
      color=RED

      #print(color+status+ENDC+' '+ url)
      print(str(count)+'t' + color+status+ENDC+' '+ url)
      file.write(str(count)+'t' + color+status+ENDC+' '+ url +'n')

      #print('Time elapsed (hh:mm:ss.ms) '.format(time_elapsed))

      end = time.time()
      print(end - start)

      # Main app
      #
      if __name__ == '__main__':
      main()






      share|improve this question











      I have to loop through a list of over 4000 urls and check their http return code in python.



      Url.txt: Contains a list of 4000 urls with one url per line.



      The script takes a long time to run and I wanted to incorporate multi-threading to improve speed but not sure if I have done it properly.



      It sure doesn't seem like it is working fast enough.



      #! /usr/bin/python

      # To just check a site and get the URL code
      #import urllib.request
      #print(urllib.request.urlopen("http://www.stackoverflow.com").getcode())
      #############################################################################

      import time
      import requests

      start = time.time()

      from multiprocessing.dummy import Pool
      pool = Pool(8) # Number of concurrent threads

      #input file
      URLS = open("url.txt","r")

      #output file
      file = open('output.csv', 'w')

      #############################################################################

      GREEN = '33[92m'
      YELLOW = '33[93m'
      RED = '33[91m'
      ENDC = '33[0m'


      def main():
      with open('url.txt') as f:

      url = f.read().splitlines()
      print( "nTesting URLs.", time.ctime())

      all_text = pool.map(checkUrls,url)
      print("closing p")
      pool.close()
      pool.join()
      #checkUrls()
      print("Press CTRL+C to exit")
      #I don't need this sleep any longer. Can I remove the next line?
      time.sleep(100000) #Sleep 10 seconds

      def checkUrls(url):
      count = 0
      status = "N/A"
      try:
      status = checkUrl(url)
      except requests.exceptions.ConnectionError:
      status = "DOWN"
      except requests.exceptions.HTTPError:
      status = "HttpError"
      except requests.exceptions.ProxyError:
      status = "ProxyError"
      except requests.exceptions.Timeout:
      status = "TimeoutError"
      except requests.exceptions.ConnectTimeout:
      status = "connectTimeout"
      except requests.exceptions.ReadTimeout:
      status = "ReadTimeout"
      except requests.exceptions.TooManyRedirects:
      status = "TooManyRedirects"
      except requests.exceptions.MissingSchema:
      status = "MissingSchema"
      except requests.exceptions.InvalidURL:
      status = "InvalidURL"
      except requests.exceptions.InvalidHeader:
      status = "InvalidHeader"
      except requests.exceptions.URLRequired:
      status = "URLmissing"
      except requests.exceptions.InvalidProxyURL:
      status = "InvalidProxy"
      except requests.exceptions.RetryError:
      status = "RetryError"
      except requests.exceptions.InvalidSchema:
      status = "InvalidSchema"

      printStatus(url, status, count)

      count+=1
      time_elapsed = datetime.now() - start_time


      def checkUrl(url):
      r = requests.get(url, timeout=5)
      #print r.status_code
      return str(r.status_code)


      def printStatus(url, status, count):
      color = GREEN

      count= count+1
      if status != "200":
      color=RED

      #print(color+status+ENDC+' '+ url)
      print(str(count)+'t' + color+status+ENDC+' '+ url)
      file.write(str(count)+'t' + color+status+ENDC+' '+ url +'n')

      #print('Time elapsed (hh:mm:ss.ms) '.format(time_elapsed))

      end = time.time()
      print(end - start)

      # Main app
      #
      if __name__ == '__main__':
      main()








      share|improve this question










      share|improve this question




      share|improve this question









      asked Jan 29 at 16:46









      Stryker

      10315




      10315




















          2 Answers
          2






          active

          oldest

          votes

















          up vote
          1
          down vote



          accepted










          Python has something called the GIL (Global Interpreter Lock), which restricts the number of threads that can run Python code concurrently to one. This limitation only concerns pure Python code (so modules written in C, like numpy, might release this lock).



          Have you tried using multiprocessing.Pool, instead of multiprocessing.dummy.Pool?



          As an additional point, Python has an official style-guide, PEP8. It recommends using lower_case for variables and functions.






          share|improve this answer




























            up vote
            0
            down vote













            I decided to change the code to the following version, which runs a lot faster:



            import urllib.request
            import urllib.error
            import time
            from multiprocessing import Pool

            # Wall-clock reference point for the "done in" report.
            start = time.time()

            # Read the URL list once; the context manager closes the file afterwards.
            with open('url10.txt', 'r', encoding="ISO-8859-1") as file:
                # Drop the line terminators (readlines() would keep them as part
                # of every URL) and skip blank lines.
                urls = [line.strip() for line in file if line.strip()]

            print(urls)


            def checkurl(url):
                """Open url and print a one-line result.

                Prints 'good, <url>' when the URL opens without error,
                'HTTPError: <code>, <url>' when urlopen raises HTTPError, and
                'URLError: <reason>, <url>' when it raises any other URLError.
                Returns None.
                """
                try:
                    conn = urllib.request.urlopen(url)
                except urllib.error.HTTPError as e:
                    # Return code error (e.g. 404, 501, ...)
                    # The '{}' placeholder is required; without it format()
                    # silently discards the code.
                    print('HTTPError: {}'.format(e.code) + ', ' + url)
                except urllib.error.URLError as e:
                    # Not an HTTP-specific error (e.g. connection refused)
                    print('URLError: {}'.format(e.reason) + ', ' + url)
                else:
                    # Opened without error: release the response before reporting.
                    conn.close()
                    print('good' + ', ' + url)


            if __name__ == "__main__":
                # p.map blocks until every URL is processed; leaving the with
                # block then shuts the worker processes down.
                with Pool(processes=20) as p:
                    result = p.map(checkurl, urls)

                print("done in : ", time.time()-start)



            Url.txt file contains a list of urls



            http://yahoo.com
            http://www.google.com


            I have about 1000 URLs to check, and it seems to work. Any suggestions to improve the functionality?






            share|improve this answer























              Your Answer




              StackExchange.ifUsing("editor", function ()
              return StackExchange.using("mathjaxEditing", function ()
              StackExchange.MarkdownEditor.creationCallbacks.add(function (editor, postfix)
              StackExchange.mathjaxEditing.prepareWmdForMathJax(editor, postfix, [["\$", "\$"]]);
              );
              );
              , "mathjax-editing");

              StackExchange.ifUsing("editor", function ()
              StackExchange.using("externalEditor", function ()
              StackExchange.using("snippets", function ()
              StackExchange.snippets.init();
              );
              );
              , "code-snippets");

              StackExchange.ready(function()
              var channelOptions =
              tags: "".split(" "),
              id: "196"
              ;
              initTagRenderer("".split(" "), "".split(" "), channelOptions);

              StackExchange.using("externalEditor", function()
              // Have to fire editor after snippets, if snippets enabled
              if (StackExchange.settings.snippets.snippetsEnabled)
              StackExchange.using("snippets", function()
              createEditor();
              );

              else
              createEditor();

              );

              function createEditor()
              StackExchange.prepareEditor(
              heartbeatType: 'answer',
              convertImagesToLinks: false,
              noModals: false,
              showLowRepImageUploadWarning: true,
              reputationToPostImages: null,
              bindNavPrevention: true,
              postfix: "",
              onDemand: true,
              discardSelector: ".discard-answer"
              ,immediatelyShowMarkdownHelp:true
              );



              );








               

              draft saved


              draft discarded


















              StackExchange.ready(
              function ()
              StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f186268%2floop-through-a-list-of-urls-multi-threading-and-check-return-code-in-python%23new-answer', 'question_page');

              );

              Post as a guest






























              2 Answers
              2






              active

              oldest

              votes








              2 Answers
              2






              active

              oldest

              votes









              active

              oldest

              votes






              active

              oldest

              votes








              up vote
              1
              down vote



              accepted










              Python has something called the GIL (Global Interpreter Lock), which restricts the number of threads that can run Python code concurrently to one. This limitation only concerns pure Python code (so modules written in C, like numpy, might release this lock).



              Have you tried using multiprocessing.Pool, instead of multiprocessing.dummy.Pool?



              As an additional point, Python has an official style-guide, PEP8. It recommends using lower_case for variables and functions.






              share|improve this answer

























                up vote
                1
                down vote



                accepted










                Python has something called the GIL (Global Interpreter Lock), which restricts the number of threads that can run Python code concurrently to one. This limitation only concerns pure Python code (so modules written in C, like numpy, might release this lock).



                Have you tried using multiprocessing.Pool, instead of multiprocessing.dummy.Pool?



                As an additional point, Python has an official style-guide, PEP8. It recommends using lower_case for variables and functions.






                share|improve this answer























                  up vote
                  1
                  down vote



                  accepted







                  up vote
                  1
                  down vote



                  accepted






                  Python has something called the GIL (Global Interpreter Lock), which restricts the number of threads that can run Python code concurrently to one. This limitation only concerns pure Python code (so modules written in C, like numpy, might release this lock).



                  Have you tried using multiprocessing.Pool, instead of multiprocessing.dummy.Pool?



                  As an additional point, Python has an official style-guide, PEP8. It recommends using lower_case for variables and functions.






                  share|improve this answer













                  Python has something called the GIL (Global Interpreter Lock), which restricts the number of threads that can run Python code concurrently to one. This limitation only concerns pure Python code (so modules written in C, like numpy, might release this lock).



                  Have you tried using multiprocessing.Pool, instead of multiprocessing.dummy.Pool?



                  As an additional point, Python has an official style-guide, PEP8. It recommends using lower_case for variables and functions.







                  share|improve this answer













                  share|improve this answer



                  share|improve this answer











                  answered Jan 29 at 18:06









                  Graipher

                  20.5k43081




                  20.5k43081






















                      up vote
                      0
                      down vote













                      Here is what I decided to change the code to this version, which runs a lot faster:



                      import urllib.request
                      import urllib.error
                      import time
                      from multiprocessing import Pool

                      start = time.time()

                      file = open('url10.txt', 'r', encoding="ISO-8859-1")
                      urls = file.readlines()

                      print(urls)


                      def checkurl(url):
                      try:
                      conn = urllib.request.urlopen(url)
                      except urllib.error.HTTPError as e:
                      # Return code error (e.g. 404, 501, ...)
                      # ...
                      print('HTTPError: '.format(e.code) + ', ' + url)
                      except urllib.error.URLError as e:
                      # Not an HTTP-specific error (e.g. connection refused)
                      # ...
                      print('URLError: '.format(e.reason) + ', ' + url)
                      else:
                      # 200
                      # ...
                      print('good' + ', ' + url)


                      if __name__ == "__main__":
                      p = Pool(processes=20)
                      result = p.map(checkurl, urls)

                      print("done in : ", time.time()-start)



                      Url.txt file contains a list of urls



                      http://yahoo.com
                      http://www.google.com


                      I have about a 1000 urls to check and it seems to work. Any suggestions to improve the functionality?






                      share|improve this answer



























                        up vote
                        0
                        down vote













                        Here is what I decided to change the code to this version, which runs a lot faster:



                        import urllib.request
                        import urllib.error
                        import time
                        from multiprocessing import Pool

                        start = time.time()

                        file = open('url10.txt', 'r', encoding="ISO-8859-1")
                        urls = file.readlines()

                        print(urls)


                        def checkurl(url):
                        try:
                        conn = urllib.request.urlopen(url)
                        except urllib.error.HTTPError as e:
                        # Return code error (e.g. 404, 501, ...)
                        # ...
                        print('HTTPError: '.format(e.code) + ', ' + url)
                        except urllib.error.URLError as e:
                        # Not an HTTP-specific error (e.g. connection refused)
                        # ...
                        print('URLError: '.format(e.reason) + ', ' + url)
                        else:
                        # 200
                        # ...
                        print('good' + ', ' + url)


                        if __name__ == "__main__":
                        p = Pool(processes=20)
                        result = p.map(checkurl, urls)

                        print("done in : ", time.time()-start)



                        Url.txt file contains a list of urls



                        http://yahoo.com
                        http://www.google.com


                        I have about a 1000 urls to check and it seems to work. Any suggestions to improve the functionality?






                        share|improve this answer

























                          up vote
                          0
                          down vote










                          up vote
                          0
                          down vote









                          Here is what I decided to change the code to this version, which runs a lot faster:



                          import urllib.request
                          import urllib.error
                          import time
                          from multiprocessing import Pool

                          start = time.time()

                          file = open('url10.txt', 'r', encoding="ISO-8859-1")
                          urls = file.readlines()

                          print(urls)


                          def checkurl(url):
                          try:
                          conn = urllib.request.urlopen(url)
                          except urllib.error.HTTPError as e:
                          # Return code error (e.g. 404, 501, ...)
                          # ...
                          print('HTTPError: '.format(e.code) + ', ' + url)
                          except urllib.error.URLError as e:
                          # Not an HTTP-specific error (e.g. connection refused)
                          # ...
                          print('URLError: '.format(e.reason) + ', ' + url)
                          else:
                          # 200
                          # ...
                          print('good' + ', ' + url)


                          if __name__ == "__main__":
                          p = Pool(processes=20)
                          result = p.map(checkurl, urls)

                          print("done in : ", time.time()-start)



                          Url.txt file contains a list of urls



                          http://yahoo.com
                          http://www.google.com


                          I have about a 1000 urls to check and it seems to work. Any suggestions to improve the functionality?






                          share|improve this answer















                          Here is what I decided to change the code to this version, which runs a lot faster:



                          import urllib.request
                          import urllib.error
                          import time
                          from multiprocessing import Pool

                          start = time.time()

                          file = open('url10.txt', 'r', encoding="ISO-8859-1")
                          urls = file.readlines()

                          print(urls)


                          def checkurl(url):
                          try:
                          conn = urllib.request.urlopen(url)
                          except urllib.error.HTTPError as e:
                          # Return code error (e.g. 404, 501, ...)
                          # ...
                          print('HTTPError: '.format(e.code) + ', ' + url)
                          except urllib.error.URLError as e:
                          # Not an HTTP-specific error (e.g. connection refused)
                          # ...
                          print('URLError: '.format(e.reason) + ', ' + url)
                          else:
                          # 200
                          # ...
                          print('good' + ', ' + url)


                          if __name__ == "__main__":
                          p = Pool(processes=20)
                          result = p.map(checkurl, urls)

                          print("done in : ", time.time()-start)



                          Url.txt file contains a list of urls



                          http://yahoo.com
                          http://www.google.com


                          I have about a 1000 urls to check and it seems to work. Any suggestions to improve the functionality?







                          share|improve this answer















                          share|improve this answer



                          share|improve this answer








                          edited Jan 30 at 10:45









                          Graipher

                          20.5k43081




                          20.5k43081











                          answered Jan 29 at 20:51









                          Stryker

                          10315




                          10315






















                               

                              draft saved


                              draft discarded


























                               


                              draft saved


                              draft discarded














                              StackExchange.ready(
                              function ()
                              StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f186268%2floop-through-a-list-of-urls-multi-threading-and-check-return-code-in-python%23new-answer', 'question_page');

                              );

                              Post as a guest













































































                              Popular posts from this blog

                              Python Lists

                              Aion

                              JavaScript Array Iteration Methods