Loop through a list of URLs multi-threading and check return code in python

.everyoneloves__top-leaderboard:empty,.everyoneloves__mid-leaderboard:empty{margin-bottom:0;}

up vote
0
down vote

favorite

I have to loop through a list of over 4000 urls and check their http return code in python.

Url.txt: Contains a list of 4000 urls with one url per line.

The script takes a long time to run, and I wanted to incorporate multi-threading to improve its speed, but I am not sure if I have done it properly.

It sure doesn't seem like it is working fast enough.

#! /usr/bin/python

# To just check a site and get the URL code
#import urllib.request
#print(urllib.request.urlopen("http://www.stackoverflow.com").getcode())
#############################################################################

import threading
import time

import requests

# Wall-clock reference captured at import time; a later time.time() reading
# is subtracted from it to report elapsed seconds.
start = time.time()

from multiprocessing.dummy import Pool
# multiprocessing.dummy.Pool exposes the Pool API on top of threads, so these
# eight workers are threads rather than processes.
pool = Pool(8) # Number of concurrent threads

# Input file.
# NOTE(review): nothing in the visible code reads from or closes URLS; main()
# opens url.txt again on its own. Looks like a leftover handle - confirm and
# remove.
URLS = open("url.txt","r")

# Output file that printStatus() appends one line per URL to.
# NOTE(review): the handle is never explicitly closed in the visible code, and
# the name shadows the Python 2 builtin `file`.
file = open('output.csv', 'w') 

#############################################################################

# ANSI SGR escape sequences used to colour console output. Each one must
# start with the ESC character (octal 033); without it the terminal prints
# the digits literally instead of switching colour.
GREEN = '\033[92m'   # bright green foreground
YELLOW = '\033[93m'  # bright yellow foreground
RED = '\033[91m'     # bright red foreground
ENDC = '\033[0m'     # reset all attributes


def main():
    """Check every URL listed in url.txt on the shared thread pool.

    Reads url.txt (one URL per line), hands each URL to checkUrls() through
    the module-level ``pool``, then shuts the pool down. Returns None.
    """
    with open('url.txt') as f:
        # Blank lines are skipped so they are not reported as broken URLs.
        urls = [line.strip() for line in f if line.strip()]

    print("\nTesting URLs.", time.ctime())

    # map() blocks until every URL has been processed, so nothing is still
    # running by the time the pool is closed below.
    pool.map(checkUrls, urls)
    print("closing p")
    pool.close()
    pool.join()
    # The former time.sleep(100000) (about 27 hours, not the "10 seconds" its
    # comment claimed) and its "Press CTRL+C to exit" prompt are gone: all
    # work is finished here, so the script can simply return.

# Running tally of URLs reported so far. checkUrls() executes on several pool
# threads at once, so the read-and-increment is guarded by a lock.
_checked_lock = threading.Lock()
_checked_total = 0


def _next_position():
    """Return the zero-based position of the next URL to be reported."""
    global _checked_total
    with _checked_lock:
        position = _checked_total
        _checked_total += 1
    return position


def checkUrls(url):
    """Check one URL and report its status via printStatus().

    The status is the HTTP status code as a string, or a short label naming
    the requests exception that prevented a response.

    Args:
        url: The URL to request.
    """
    status = "N/A"
    try:
        status = checkUrl(url)
    # Handlers run most-specific first: ProxyError and ConnectTimeout derive
    # from ConnectionError, ConnectTimeout and ReadTimeout from Timeout, and
    # InvalidProxyURL from InvalidURL, so listing a base class earlier would
    # make the subclass handler unreachable.
    except requests.exceptions.ProxyError:
        status = "ProxyError"
    except requests.exceptions.ConnectTimeout:
        status = "connectTimeout"
    except requests.exceptions.ReadTimeout:
        status = "ReadTimeout"
    except requests.exceptions.Timeout:
        status = "TimeoutError"
    except requests.exceptions.ConnectionError:
        status = "DOWN"
    except requests.exceptions.HTTPError:
        status = "HttpError"
    except requests.exceptions.TooManyRedirects:
        status = "TooManyRedirects"
    except requests.exceptions.MissingSchema:
        status = "MissingSchema"
    except requests.exceptions.InvalidProxyURL:
        status = "InvalidProxy"
    except requests.exceptions.InvalidURL:
        status = "InvalidURL"
    except requests.exceptions.InvalidHeader:
        status = "InvalidHeader"
    except requests.exceptions.URLRequired:
        status = "URLmissing"
    except requests.exceptions.RetryError:
        status = "RetryError"
    except requests.exceptions.InvalidSchema:
        status = "InvalidSchema"
    except requests.exceptions.RequestException:
        # Any other requests failure: report it instead of letting one bad
        # URL abort the whole pool.map() run.
        status = "RequestError"

    # printStatus() adds one before displaying, so a zero-based position
    # yields rows numbered 1, 2, 3, ... across all threads.
    printStatus(url, status, _next_position())


def checkUrl(url):
    """Request *url* and return its HTTP status code as a string.

    Args:
        url: The URL to fetch with a GET request (5 second timeout).

    Returns:
        The response status code, e.g. "200".

    Raises:
        requests.exceptions.RequestException: If no response is obtained.
    """
    # stream=True returns as soon as the headers arrive instead of
    # downloading the whole body, which is all a status check needs. The
    # with-block releases the connection afterwards.
    with requests.get(url, timeout=5, stream=True) as r:
        return str(r.status_code)


def printStatus(url, status, count):
    """Report one URL's result on the console and in the output file.

    Args:
        url: The URL that was checked.
        status: Status code string ("200", "404", ...) or an error label.
        count: Zero-based position of this URL; one is added for display.
    """
    count = count + 1
    # Anything other than a plain 200 is highlighted in red.
    color = GREEN if status == "200" else RED

    # Console line is coloured; tab separates the row number from the status.
    print(str(count) + '\t' + color + status + ENDC + ' ' + url)
    # The file gets the same row without terminal colour codes, which would
    # only show up as junk characters when the file is opened.
    file.write(str(count) + '\t' + status + ' ' + url + '\n')

 #print('Time elapsed (hh:mm:ss.ms) '.format(time_elapsed)) 

# Main app
#
if __name__ == '__main__':
    main()
    # Report elapsed wall-clock seconds only after the checks have run; at
    # module level this executed before main() and always printed ~0.
    end = time.time()
    print(end - start)

asked Jan 29 at 16:46

Stryker

10315

add a comment |

up vote
0
down vote

favorite

I have to loop through a list of over 4000 urls and check their http return code in python.

Url.txt: Contains a list of 4000 urls with one url per line.

The script takes a long time to run and I wanted to incorporate multi-threading to improve speed but not sure if I have done it properly.

It sure doesn't seem like it is working fast enough.

#! /usr/bin/python

# To just check a site and get the URL code
#import urllib.request
#print(urllib.request.urlopen("http://www.stackoverflow.com").getcode())
#############################################################################

import time
import requests

start = time.time()

from multiprocessing.dummy import Pool
pool = Pool(8) # Number of concurrent threads

#input file
URLS = open("url.txt","r")

#output file
file = open('output.csv', 'w') 

#############################################################################

GREEN = '33[92m'
YELLOW = '33[93m'
RED = '33[91m'
ENDC = '33[0m'


def main():
 with open('url.txt') as f:

 url = f.read().splitlines()
 print( "nTesting URLs.", time.ctime())

 all_text = pool.map(checkUrls,url)
 print("closing p")
 pool.close()
 pool.join()
 #checkUrls()
 print("Press CTRL+C to exit")
 #I don't need this sleep any longer. Can I remove the next line?
 time.sleep(100000) #Sleep 10 seconds

def checkUrls(url):
 count = 0
 status = "N/A"
 try:
 status = checkUrl(url)
 except requests.exceptions.ConnectionError:
 status = "DOWN"
 except requests.exceptions.HTTPError:
 status = "HttpError"
 except requests.exceptions.ProxyError:
 status = "ProxyError"
 except requests.exceptions.Timeout:
 status = "TimeoutError"
 except requests.exceptions.ConnectTimeout:
 status = "connectTimeout" 
 except requests.exceptions.ReadTimeout:
 status = "ReadTimeout" 
 except requests.exceptions.TooManyRedirects:
 status = "TooManyRedirects" 
 except requests.exceptions.MissingSchema:
 status = "MissingSchema" 
 except requests.exceptions.InvalidURL:
 status = "InvalidURL" 
 except requests.exceptions.InvalidHeader:
 status = "InvalidHeader" 
 except requests.exceptions.URLRequired:
 status = "URLmissing" 
 except requests.exceptions.InvalidProxyURL:
 status = "InvalidProxy" 
 except requests.exceptions.RetryError:
 status = "RetryError" 
 except requests.exceptions.InvalidSchema:
 status = "InvalidSchema" 

 printStatus(url, status, count)

 count+=1
 time_elapsed = datetime.now() - start_time


def checkUrl(url):
 r = requests.get(url, timeout=5)
 #print r.status_code
 return str(r.status_code)


def printStatus(url, status, count):
 color = GREEN

 count= count+1
 if status != "200":
 color=RED

 #print(color+status+ENDC+' '+ url)
 print(str(count)+'t' + color+status+ENDC+' '+ url)
 file.write(str(count)+'t' + color+status+ENDC+' '+ url +'n')

 #print('Time elapsed (hh:mm:ss.ms) '.format(time_elapsed)) 

end = time.time()
print(end - start) 

# Main app
#
if __name__ == '__main__':
 main()

asked Jan 29 at 16:46

Stryker

10315

add a comment |

up vote
0
down vote

favorite

I have to loop through a list of over 4000 urls and check their http return code in python.

Url.txt: Contains a list of 4000 urls with one url per line.

The script takes a long time to run and I wanted to incorporate multi-threading to improve speed but not sure if I have done it properly.

It sure doesn't seem like it is working fast enough.

#! /usr/bin/python

# To just check a site and get the URL code
#import urllib.request
#print(urllib.request.urlopen("http://www.stackoverflow.com").getcode())
#############################################################################

import time
import requests

start = time.time()

from multiprocessing.dummy import Pool
pool = Pool(8) # Number of concurrent threads

#input file
URLS = open("url.txt","r")

#output file
file = open('output.csv', 'w') 

#############################################################################

GREEN = '33[92m'
YELLOW = '33[93m'
RED = '33[91m'
ENDC = '33[0m'


def main():
 with open('url.txt') as f:

 url = f.read().splitlines()
 print( "nTesting URLs.", time.ctime())

 all_text = pool.map(checkUrls,url)
 print("closing p")
 pool.close()
 pool.join()
 #checkUrls()
 print("Press CTRL+C to exit")
 #I don't need this sleep any longer. Can I remove the next line?
 time.sleep(100000) #Sleep 10 seconds

def checkUrls(url):
 count = 0
 status = "N/A"
 try:
 status = checkUrl(url)
 except requests.exceptions.ConnectionError:
 status = "DOWN"
 except requests.exceptions.HTTPError:
 status = "HttpError"
 except requests.exceptions.ProxyError:
 status = "ProxyError"
 except requests.exceptions.Timeout:
 status = "TimeoutError"
 except requests.exceptions.ConnectTimeout:
 status = "connectTimeout" 
 except requests.exceptions.ReadTimeout:
 status = "ReadTimeout" 
 except requests.exceptions.TooManyRedirects:
 status = "TooManyRedirects" 
 except requests.exceptions.MissingSchema:
 status = "MissingSchema" 
 except requests.exceptions.InvalidURL:
 status = "InvalidURL" 
 except requests.exceptions.InvalidHeader:
 status = "InvalidHeader" 
 except requests.exceptions.URLRequired:
 status = "URLmissing" 
 except requests.exceptions.InvalidProxyURL:
 status = "InvalidProxy" 
 except requests.exceptions.RetryError:
 status = "RetryError" 
 except requests.exceptions.InvalidSchema:
 status = "InvalidSchema" 

 printStatus(url, status, count)

 count+=1
 time_elapsed = datetime.now() - start_time


def checkUrl(url):
 r = requests.get(url, timeout=5)
 #print r.status_code
 return str(r.status_code)


def printStatus(url, status, count):
 color = GREEN

 count= count+1
 if status != "200":
 color=RED

 #print(color+status+ENDC+' '+ url)
 print(str(count)+'t' + color+status+ENDC+' '+ url)
 file.write(str(count)+'t' + color+status+ENDC+' '+ url +'n')

 #print('Time elapsed (hh:mm:ss.ms) '.format(time_elapsed)) 

end = time.time()
print(end - start) 

# Main app
#
if __name__ == '__main__':
 main()

asked Jan 29 at 16:46

Stryker

10315

I have to loop through a list of over 4000 urls and check their http return code in python.

Url.txt: Contains a list of 4000 urls with one url per line.

The script takes a long time to run and I wanted to incorporate multi-threading to improve speed but not sure if I have done it properly.

It sure doesn't seem like it is working fast enough.

#! /usr/bin/python

# To just check a site and get the URL code
#import urllib.request
#print(urllib.request.urlopen("http://www.stackoverflow.com").getcode())
#############################################################################

import time
import requests

start = time.time()

from multiprocessing.dummy import Pool
pool = Pool(8) # Number of concurrent threads

#input file
URLS = open("url.txt","r")

#output file
file = open('output.csv', 'w') 

#############################################################################

GREEN = '33[92m'
YELLOW = '33[93m'
RED = '33[91m'
ENDC = '33[0m'


def main():
 with open('url.txt') as f:

 url = f.read().splitlines()
 print( "nTesting URLs.", time.ctime())

 all_text = pool.map(checkUrls,url)
 print("closing p")
 pool.close()
 pool.join()
 #checkUrls()
 print("Press CTRL+C to exit")
 #I don't need this sleep any longer. Can I remove the next line?
 time.sleep(100000) #Sleep 10 seconds

def checkUrls(url):
 count = 0
 status = "N/A"
 try:
 status = checkUrl(url)
 except requests.exceptions.ConnectionError:
 status = "DOWN"
 except requests.exceptions.HTTPError:
 status = "HttpError"
 except requests.exceptions.ProxyError:
 status = "ProxyError"
 except requests.exceptions.Timeout:
 status = "TimeoutError"
 except requests.exceptions.ConnectTimeout:
 status = "connectTimeout" 
 except requests.exceptions.ReadTimeout:
 status = "ReadTimeout" 
 except requests.exceptions.TooManyRedirects:
 status = "TooManyRedirects" 
 except requests.exceptions.MissingSchema:
 status = "MissingSchema" 
 except requests.exceptions.InvalidURL:
 status = "InvalidURL" 
 except requests.exceptions.InvalidHeader:
 status = "InvalidHeader" 
 except requests.exceptions.URLRequired:
 status = "URLmissing" 
 except requests.exceptions.InvalidProxyURL:
 status = "InvalidProxy" 
 except requests.exceptions.RetryError:
 status = "RetryError" 
 except requests.exceptions.InvalidSchema:
 status = "InvalidSchema" 

 printStatus(url, status, count)

 count+=1
 time_elapsed = datetime.now() - start_time


def checkUrl(url):
 r = requests.get(url, timeout=5)
 #print r.status_code
 return str(r.status_code)


def printStatus(url, status, count):
 color = GREEN

 count= count+1
 if status != "200":
 color=RED

 #print(color+status+ENDC+' '+ url)
 print(str(count)+'t' + color+status+ENDC+' '+ url)
 file.write(str(count)+'t' + color+status+ENDC+' '+ url +'n')

 #print('Time elapsed (hh:mm:ss.ms) '.format(time_elapsed)) 

end = time.time()
print(end - start) 

# Main app
#
if __name__ == '__main__':
 main()

asked Jan 29 at 16:46

Stryker

10315

asked Jan 29 at 16:46

Stryker

10315

asked Jan 29 at 16:46

Stryker

10315

asked Jan 29 at 16:46

Stryker

10315

add a comment |

2 Answers
2

active

oldest

votes

up vote
1
down vote

accepted

Python has something called the GIL (Global Interpreter Lock), which restricts the number of threads that can concurrently run to one. This limitation only concerns pure Python code (so modules written in C, like numpy, might release this lock).

Have you tried using multiprocessing.Pool, instead of multiprocessing.dummy.Pool?

As an additional point, Python has an official style-guide, PEP8. It recommends using lower_case for variables and functions.

answered Jan 29 at 18:06

Graipher

20.5k43081

add a comment |

up vote
0
down vote

Here is the version I decided to change the code to, which runs a lot faster:

import urllib.request
import urllib.error
import time
from multiprocessing import Pool

# Wall-clock reference for the "done in" report at the end of the script.
start = time.time()

# The with-block closes the handle once the list is built. Each line is
# stripped of its trailing newline and blank lines are dropped, so every
# entry in `urls` is a bare URL.
# NOTE(review): under the "spawn" start method every worker process
# re-imports this module and repeats this read and the print below; consider
# moving both under the __main__ guard.
with open('url10.txt', 'r', encoding="ISO-8859-1") as file:
    urls = [line.strip() for line in file if line.strip()]

print(urls)


def checkurl(url):
    """Open *url* and print a one-line verdict for it.

    Prints "good, <url>" when the URL opens, "HTTPError: <code>, <url>" when
    the server answers with an error status, "URLError: <reason>, <url>" when
    no HTTP response is obtained, and "Error: <detail>, <url>" for a
    malformed URL or a lower-level I/O failure.

    Args:
        url: The URL to check; surrounding whitespace such as the newline
            left by readlines() is ignored.
    """
    url = url.strip()
    try:
        # Only success or failure matters; the with-block closes the
        # response so the connection is not leaked.
        with urllib.request.urlopen(url):
            pass
    except urllib.error.HTTPError as e:
        # Return code error (e.g. 404, 501, ...)
        print('HTTPError: {}'.format(e.code) + ', ' + url)
    except urllib.error.URLError as e:
        # Not an HTTP-specific error (e.g. connection refused)
        print('URLError: {}'.format(e.reason) + ', ' + url)
    except (OSError, ValueError) as e:
        # Malformed URL (e.g. no scheme) or a socket-level failure. Reported
        # here so one bad line does not abort the whole Pool.map() run.
        print('Error: {}'.format(e) + ', ' + url)
    else:
        # The URL opened without raising.
        print('good' + ', ' + url)


if __name__ == "__main__":
    # map() blocks until every URL has been checked; leaving the with-block
    # then shuts the worker processes down instead of leaving them running.
    with Pool(processes=20) as p:
        result = p.map(checkurl, urls)

    # Kept inside the guard so that only the parent process reports the
    # elapsed time, and only after the checks have finished.
    print("done in : ", time.time()-start)

Url.txt file contains a list of urls

http://yahoo.com
http://www.google.com

I have about 1000 URLs to check and it seems to work. Any suggestions to improve the functionality?

edited Jan 30 at 10:45

Graipher

20.5k43081

answered Jan 29 at 20:51

Stryker

10315

add a comment |

Your Answer

StackExchange.ifUsing("editor", function ()
return StackExchange.using("mathjaxEditing", function ()
StackExchange.MarkdownEditor.creationCallbacks.add(function (editor, postfix)
StackExchange.mathjaxEditing.prepareWmdForMathJax(editor, postfix, [["\$", "\$"]]);
);
);
, "mathjax-editing");

StackExchange.ifUsing("editor", function ()
StackExchange.using("externalEditor", function ()
StackExchange.using("snippets", function ()
StackExchange.snippets.init();
);
);
, "code-snippets");

StackExchange.ready(function()
var channelOptions =
tags: "".split(" "),
id: "196"
;
initTagRenderer("".split(" "), "".split(" "), channelOptions);

StackExchange.using("externalEditor", function()
// Have to fire editor after snippets, if snippets enabled
if (StackExchange.settings.snippets.snippetsEnabled)
StackExchange.using("snippets", function()
createEditor();
);

else
createEditor();

);

function createEditor()
StackExchange.prepareEditor(
heartbeatType: 'answer',
convertImagesToLinks: false,
noModals: false,
showLowRepImageUploadWarning: true,
reputationToPostImages: null,
bindNavPrevention: true,
postfix: "",
onDemand: true,
discardSelector: ".discard-answer"
,immediatelyShowMarkdownHelp:true
);

);

draft saved

draft discarded

StackExchange.ready(
function ()
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f186268%2floop-through-a-list-of-urls-multi-threading-and-check-return-code-in-python%23new-answer', 'question_page');

);

Post as a guest

Name

2 Answers
2

active

oldest

votes

2 Answers
2

active

oldest

votes

up vote
1
down vote

accepted

Have you tried using multiprocessing.Pool, instead of multiprocessing.dummy.Pool?

As an additional point, Python has an official style-guide, PEP8. It recommends using lower_case for variables and functions.

answered Jan 29 at 18:06

Graipher

20.5k43081

add a comment |

up vote
1
down vote

accepted

Have you tried using multiprocessing.Pool, instead of multiprocessing.dummy.Pool?

As an additional point, Python has an official style-guide, PEP8. It recommends using lower_case for variables and functions.

answered Jan 29 at 18:06

Graipher

20.5k43081

add a comment |

up vote
1
down vote

accepted

Have you tried using multiprocessing.Pool, instead of multiprocessing.dummy.Pool?

As an additional point, Python has an official style-guide, PEP8. It recommends using lower_case for variables and functions.

answered Jan 29 at 18:06

Graipher

20.5k43081

Have you tried using multiprocessing.Pool, instead of multiprocessing.dummy.Pool?

As an additional point, Python has an official style-guide, PEP8. It recommends using lower_case for variables and functions.

answered Jan 29 at 18:06

Graipher

20.5k43081

answered Jan 29 at 18:06

Graipher

20.5k43081

answered Jan 29 at 18:06

Graipher

20.5k43081

answered Jan 29 at 18:06

Graipher

20.5k43081

add a comment |

up vote
0
down vote

Here is what I decided to change the code to this version, which runs a lot faster:

import urllib.request
import urllib.error
import time
from multiprocessing import Pool

start = time.time()

file = open('url10.txt', 'r', encoding="ISO-8859-1")
urls = file.readlines()

print(urls)


def checkurl(url):
 try:
 conn = urllib.request.urlopen(url)
 except urllib.error.HTTPError as e:
 # Return code error (e.g. 404, 501, ...)
 # ...
 print('HTTPError: '.format(e.code) + ', ' + url)
 except urllib.error.URLError as e:
 # Not an HTTP-specific error (e.g. connection refused)
 # ...
 print('URLError: '.format(e.reason) + ', ' + url)
 else:
 # 200
 # ...
 print('good' + ', ' + url)


if __name__ == "__main__":
 p = Pool(processes=20)
 result = p.map(checkurl, urls)

print("done in : ", time.time()-start)

Url.txt file contains a list of urls

http://yahoo.com
http://www.google.com

I have about a 1000 urls to check and it seems to work. Any suggestions to improve the functionality?

edited Jan 30 at 10:45

Graipher

20.5k43081

answered Jan 29 at 20:51

Stryker

10315

add a comment |

up vote
0
down vote

Here is what I decided to change the code to this version, which runs a lot faster:

import urllib.request
import urllib.error
import time
from multiprocessing import Pool

start = time.time()

file = open('url10.txt', 'r', encoding="ISO-8859-1")
urls = file.readlines()

print(urls)


def checkurl(url):
 try:
 conn = urllib.request.urlopen(url)
 except urllib.error.HTTPError as e:
 # Return code error (e.g. 404, 501, ...)
 # ...
 print('HTTPError: '.format(e.code) + ', ' + url)
 except urllib.error.URLError as e:
 # Not an HTTP-specific error (e.g. connection refused)
 # ...
 print('URLError: '.format(e.reason) + ', ' + url)
 else:
 # 200
 # ...
 print('good' + ', ' + url)


if __name__ == "__main__":
 p = Pool(processes=20)
 result = p.map(checkurl, urls)

print("done in : ", time.time()-start)

Url.txt file contains a list of urls

http://yahoo.com
http://www.google.com

I have about a 1000 urls to check and it seems to work. Any suggestions to improve the functionality?

edited Jan 30 at 10:45

Graipher

20.5k43081

answered Jan 29 at 20:51

Stryker

10315

add a comment |

up vote
0
down vote

Here is what I decided to change the code to this version, which runs a lot faster:

import urllib.request
import urllib.error
import time
from multiprocessing import Pool

start = time.time()

file = open('url10.txt', 'r', encoding="ISO-8859-1")
urls = file.readlines()

print(urls)


def checkurl(url):
 try:
 conn = urllib.request.urlopen(url)
 except urllib.error.HTTPError as e:
 # Return code error (e.g. 404, 501, ...)
 # ...
 print('HTTPError: '.format(e.code) + ', ' + url)
 except urllib.error.URLError as e:
 # Not an HTTP-specific error (e.g. connection refused)
 # ...
 print('URLError: '.format(e.reason) + ', ' + url)
 else:
 # 200
 # ...
 print('good' + ', ' + url)


if __name__ == "__main__":
 p = Pool(processes=20)
 result = p.map(checkurl, urls)

print("done in : ", time.time()-start)

Url.txt file contains a list of urls

http://yahoo.com
http://www.google.com

I have about a 1000 urls to check and it seems to work. Any suggestions to improve the functionality?

edited Jan 30 at 10:45

Graipher

20.5k43081

answered Jan 29 at 20:51

Stryker

10315

Here is what I decided to change the code to this version, which runs a lot faster:

import urllib.request
import urllib.error
import time
from multiprocessing import Pool

start = time.time()

file = open('url10.txt', 'r', encoding="ISO-8859-1")
urls = file.readlines()

print(urls)


def checkurl(url):
 try:
 conn = urllib.request.urlopen(url)
 except urllib.error.HTTPError as e:
 # Return code error (e.g. 404, 501, ...)
 # ...
 print('HTTPError: '.format(e.code) + ', ' + url)
 except urllib.error.URLError as e:
 # Not an HTTP-specific error (e.g. connection refused)
 # ...
 print('URLError: '.format(e.reason) + ', ' + url)
 else:
 # 200
 # ...
 print('good' + ', ' + url)


if __name__ == "__main__":
 p = Pool(processes=20)
 result = p.map(checkurl, urls)

print("done in : ", time.time()-start)

Url.txt file contains a list of urls

http://yahoo.com
http://www.google.com

I have about a 1000 urls to check and it seems to work. Any suggestions to improve the functionality?

edited Jan 30 at 10:45

Graipher

20.5k43081

answered Jan 29 at 20:51

Stryker

10315

edited Jan 30 at 10:45

Graipher

20.5k43081

edited Jan 30 at 10:45

Graipher

20.5k43081

edited Jan 30 at 10:45

Graipher

20.5k43081

answered Jan 29 at 20:51

Stryker

10315

answered Jan 29 at 20:51

Stryker

10315

answered Jan 29 at 20:51

Stryker

10315

add a comment |

draft saved

draft discarded

draft saved

draft discarded

Post as a guest

Name

搜尋此網誌

trjhtr