Loop through a list of URLs multi-threading and check return code in python

Clash Royale CLAN TAG#URR8PPP
.everyoneloves__top-leaderboard:empty,.everyoneloves__mid-leaderboard:empty margin-bottom:0;
up vote
0
down vote
favorite
I have to loop through a list of over 4000 urls and check their http return code in python.
Url.txt: Contains a list of 4000 urls with one url per line.
The script takes a long time to run and I wanted to incorporate multi-threading to improve speed but not sure if I have done it properly.
It sure doesn't seem like it is working fast enough.
#! /usr/bin/python
# To just check a site and get the URL code
#import urllib.request
#print(urllib.request.urlopen("http://www.stackoverflow.com").getcode())
#############################################################################
import time
import requests
from multiprocessing.dummy import Pool

# Wall-clock reference used for the elapsed-time report at the end of the run.
start = time.time()

# multiprocessing.dummy.Pool exposes the multiprocessing.Pool API on top of
# threads, which suits this network-bound workload.
pool = Pool(8) # Number of concurrent threads

#input file
# NOTE(review): nothing in the visible script reads from this handle (main()
# opens 'url.txt' itself) and it is never closed. It is kept only so the
# module-level name still exists; consider deleting it.
URLS = open("url.txt","r")
#output file
file = open('output.csv', 'w')
#############################################################################
# ANSI terminal colour escape sequences. Each must start with the ESC
# character (\033); without it the terminal prints the text literally.
GREEN = '\033[92m'
YELLOW = '\033[93m'
RED = '\033[91m'
ENDC = '\033[0m'
def main():
    """Check every URL listed in 'url.txt' using the shared thread pool.

    Reads one URL per line (blank lines are skipped), hands each URL to
    checkUrls() via the module-level ``pool``, then shuts the pool down and
    closes the module-level output ``file`` so buffered results reach disk.

    The function returns as soon as all URLs have been processed; the former
    ``time.sleep(100000)`` (about 27.8 hours, not 10 seconds) only kept the
    finished process alive and has been removed.
    """
    with open('url.txt') as f:
        urls = [line.strip() for line in f if line.strip()]
    print("\nTesting URLs.", time.ctime())
    try:
        # map() blocks until every URL has been checked.
        pool.map(checkUrls, urls)
    finally:
        # Always release the worker threads and flush the results file,
        # even if a worker raised.
        print("closing p")
        pool.close()
        pool.join()
        file.close()
# Imported here because only the shared row counter below needs it.
import itertools

# Row counter shared by all worker threads. The previous per-call ``count = 0``
# made every row number 1. next() on itertools.count is a single C-level call,
# so CPython threads do not hand out duplicate numbers.
_URL_COUNTER = itertools.count()


def checkUrls(url):
    """Check one URL and report its status.

    Calls checkUrl() and converts any ``requests`` failure into a short status
    label, then passes the result to printStatus().

    Args:
        url: The URL to request.

    Returns:
        The status string that was reported: the HTTP status code as text on
        success, otherwise a label naming the failure.
    """
    # Handlers are ordered from most specific to most general. In requests'
    # exception hierarchy ProxyError and ConnectTimeout derive from
    # ConnectionError, ReadTimeout from Timeout, and InvalidProxyURL from
    # InvalidURL, so listing the parents first made those handlers unreachable.
    try:
        status = checkUrl(url)
    except requests.exceptions.ProxyError:
        status = "ProxyError"
    except requests.exceptions.ConnectTimeout:
        status = "connectTimeout"
    except requests.exceptions.ReadTimeout:
        status = "ReadTimeout"
    except requests.exceptions.InvalidProxyURL:
        status = "InvalidProxy"
    except requests.exceptions.ConnectionError:
        status = "DOWN"
    except requests.exceptions.Timeout:
        status = "TimeoutError"
    except requests.exceptions.HTTPError:
        status = "HttpError"
    except requests.exceptions.TooManyRedirects:
        status = "TooManyRedirects"
    except requests.exceptions.MissingSchema:
        status = "MissingSchema"
    except requests.exceptions.InvalidSchema:
        status = "InvalidSchema"
    except requests.exceptions.InvalidURL:
        status = "InvalidURL"
    except requests.exceptions.InvalidHeader:
        status = "InvalidHeader"
    except requests.exceptions.URLRequired:
        status = "URLmissing"
    except requests.exceptions.RetryError:
        status = "RetryError"
    except requests.exceptions.RequestException as exc:
        # Any other requests failure: report its class name instead of letting
        # one bad URL abort the whole pool.map() run.
        status = type(exc).__name__
    # printStatus() adds 1 to the value it receives, so a zero-based counter
    # yields rows numbered 1, 2, 3, ...
    printStatus(url, status, next(_URL_COUNTER))
    return status
def checkUrl(url):
    """Return the HTTP status code of a GET request to *url* as a string.

    Args:
        url: The URL to request.

    Returns:
        The response status code converted to ``str`` (e.g. ``"200"``).

    Raises:
        requests.exceptions.RequestException: If the request fails; the
            five-second timeout surfaces as a Timeout subclass.
    """
    # Only the status line is needed. stream=True stops requests from
    # downloading the body, and the with-block releases the connection.
    with requests.get(url, timeout=5, stream=True) as r:
        return str(r.status_code)
def printStatus(url, status, count):
    """Print one result row and append the same row to the output file.

    The row is ``<number><TAB><colour><status><reset> <url>``. It is coloured
    green when ``status`` is ``"200"`` and red otherwise.

    Args:
        url: The URL that was checked.
        status: Status code or failure label, as a string.
        count: Zero-based row index; the printed number is ``count + 1``.
    """
    count = count + 1
    color = GREEN if status == "200" else RED
    # Build the row once so the console and the file always agree. The
    # separators are a real tab and newline, not the letters 't' and 'n'.
    line = str(count) + '\t' + color + status + ENDC + ' ' + url
    print(line)
    file.write(line + '\n')
#print('Time elapsed (hh:mm:ss.ms) '.format(time_elapsed))
# Main app
#
if __name__ == '__main__':
    main()
    # Report total wall-clock time only after every URL has been checked.
    # NOTE(review): the original indentation was lost; as top-level statements
    # placed before the main() call, these two lines measured only start-up.
    end = time.time()
    print(end - start)
python python-3.x multithreading multiprocessing
add a comment |
up vote
0
down vote
favorite
I have to loop through a list of over 4000 urls and check their http return code in python.
Url.txt: Contains a list of 4000 urls with one url per line.
The script takes a long time to run and I wanted to incorporate multi-threading to improve speed but not sure if I have done it properly.
It sure doesn't seem like it is working fast enough.
#! /usr/bin/python
# To just check a site and get the URL code
#import urllib.request
#print(urllib.request.urlopen("http://www.stackoverflow.com").getcode())
#############################################################################
import time
import requests
start = time.time()
from multiprocessing.dummy import Pool
pool = Pool(8) # Number of concurrent threads
#input file
URLS = open("url.txt","r")
#output file
file = open('output.csv', 'w')
#############################################################################
GREEN = '33[92m'
YELLOW = '33[93m'
RED = '33[91m'
ENDC = '33[0m'
def main():
with open('url.txt') as f:
url = f.read().splitlines()
print( "nTesting URLs.", time.ctime())
all_text = pool.map(checkUrls,url)
print("closing p")
pool.close()
pool.join()
#checkUrls()
print("Press CTRL+C to exit")
#I don't need this sleep any longer. Can I remove the next line?
time.sleep(100000) #Sleep 10 seconds
def checkUrls(url):
count = 0
status = "N/A"
try:
status = checkUrl(url)
except requests.exceptions.ConnectionError:
status = "DOWN"
except requests.exceptions.HTTPError:
status = "HttpError"
except requests.exceptions.ProxyError:
status = "ProxyError"
except requests.exceptions.Timeout:
status = "TimeoutError"
except requests.exceptions.ConnectTimeout:
status = "connectTimeout"
except requests.exceptions.ReadTimeout:
status = "ReadTimeout"
except requests.exceptions.TooManyRedirects:
status = "TooManyRedirects"
except requests.exceptions.MissingSchema:
status = "MissingSchema"
except requests.exceptions.InvalidURL:
status = "InvalidURL"
except requests.exceptions.InvalidHeader:
status = "InvalidHeader"
except requests.exceptions.URLRequired:
status = "URLmissing"
except requests.exceptions.InvalidProxyURL:
status = "InvalidProxy"
except requests.exceptions.RetryError:
status = "RetryError"
except requests.exceptions.InvalidSchema:
status = "InvalidSchema"
printStatus(url, status, count)
count+=1
time_elapsed = datetime.now() - start_time
def checkUrl(url):
r = requests.get(url, timeout=5)
#print r.status_code
return str(r.status_code)
def printStatus(url, status, count):
color = GREEN
count= count+1
if status != "200":
color=RED
#print(color+status+ENDC+' '+ url)
print(str(count)+'t' + color+status+ENDC+' '+ url)
file.write(str(count)+'t' + color+status+ENDC+' '+ url +'n')
#print('Time elapsed (hh:mm:ss.ms) '.format(time_elapsed))
end = time.time()
print(end - start)
# Main app
#
if __name__ == '__main__':
main()
python python-3.x multithreading multiprocessing
add a comment |
up vote
0
down vote
favorite
up vote
0
down vote
favorite
I have to loop through a list of over 4000 urls and check their http return code in python.
Url.txt: Contains a list of 4000 urls with one url per line.
The script takes a long time to run and I wanted to incorporate multi-threading to improve speed but not sure if I have done it properly.
It sure doesn't seem like it is working fast enough.
#! /usr/bin/python
# To just check a site and get the URL code
#import urllib.request
#print(urllib.request.urlopen("http://www.stackoverflow.com").getcode())
#############################################################################
import time
import requests
start = time.time()
from multiprocessing.dummy import Pool
pool = Pool(8) # Number of concurrent threads
#input file
URLS = open("url.txt","r")
#output file
file = open('output.csv', 'w')
#############################################################################
GREEN = '33[92m'
YELLOW = '33[93m'
RED = '33[91m'
ENDC = '33[0m'
def main():
with open('url.txt') as f:
url = f.read().splitlines()
print( "nTesting URLs.", time.ctime())
all_text = pool.map(checkUrls,url)
print("closing p")
pool.close()
pool.join()
#checkUrls()
print("Press CTRL+C to exit")
#I don't need this sleep any longer. Can I remove the next line?
time.sleep(100000) #Sleep 10 seconds
def checkUrls(url):
count = 0
status = "N/A"
try:
status = checkUrl(url)
except requests.exceptions.ConnectionError:
status = "DOWN"
except requests.exceptions.HTTPError:
status = "HttpError"
except requests.exceptions.ProxyError:
status = "ProxyError"
except requests.exceptions.Timeout:
status = "TimeoutError"
except requests.exceptions.ConnectTimeout:
status = "connectTimeout"
except requests.exceptions.ReadTimeout:
status = "ReadTimeout"
except requests.exceptions.TooManyRedirects:
status = "TooManyRedirects"
except requests.exceptions.MissingSchema:
status = "MissingSchema"
except requests.exceptions.InvalidURL:
status = "InvalidURL"
except requests.exceptions.InvalidHeader:
status = "InvalidHeader"
except requests.exceptions.URLRequired:
status = "URLmissing"
except requests.exceptions.InvalidProxyURL:
status = "InvalidProxy"
except requests.exceptions.RetryError:
status = "RetryError"
except requests.exceptions.InvalidSchema:
status = "InvalidSchema"
printStatus(url, status, count)
count+=1
time_elapsed = datetime.now() - start_time
def checkUrl(url):
r = requests.get(url, timeout=5)
#print r.status_code
return str(r.status_code)
def printStatus(url, status, count):
color = GREEN
count= count+1
if status != "200":
color=RED
#print(color+status+ENDC+' '+ url)
print(str(count)+'t' + color+status+ENDC+' '+ url)
file.write(str(count)+'t' + color+status+ENDC+' '+ url +'n')
#print('Time elapsed (hh:mm:ss.ms) '.format(time_elapsed))
end = time.time()
print(end - start)
# Main app
#
if __name__ == '__main__':
main()
python python-3.x multithreading multiprocessing
I have to loop through a list of over 4000 urls and check their http return code in python.
Url.txt: Contains a list of 4000 urls with one url per line.
The script takes a long time to run and I wanted to incorporate multi-threading to improve speed but not sure if I have done it properly.
It sure doesn't seem like it is working fast enough.
#! /usr/bin/python
# To just check a site and get the URL code
#import urllib.request
#print(urllib.request.urlopen("http://www.stackoverflow.com").getcode())
#############################################################################
import time
import requests
start = time.time()
from multiprocessing.dummy import Pool
pool = Pool(8) # Number of concurrent threads
#input file
URLS = open("url.txt","r")
#output file
file = open('output.csv', 'w')
#############################################################################
GREEN = '33[92m'
YELLOW = '33[93m'
RED = '33[91m'
ENDC = '33[0m'
def main():
with open('url.txt') as f:
url = f.read().splitlines()
print( "nTesting URLs.", time.ctime())
all_text = pool.map(checkUrls,url)
print("closing p")
pool.close()
pool.join()
#checkUrls()
print("Press CTRL+C to exit")
#I don't need this sleep any longer. Can I remove the next line?
time.sleep(100000) #Sleep 10 seconds
def checkUrls(url):
count = 0
status = "N/A"
try:
status = checkUrl(url)
except requests.exceptions.ConnectionError:
status = "DOWN"
except requests.exceptions.HTTPError:
status = "HttpError"
except requests.exceptions.ProxyError:
status = "ProxyError"
except requests.exceptions.Timeout:
status = "TimeoutError"
except requests.exceptions.ConnectTimeout:
status = "connectTimeout"
except requests.exceptions.ReadTimeout:
status = "ReadTimeout"
except requests.exceptions.TooManyRedirects:
status = "TooManyRedirects"
except requests.exceptions.MissingSchema:
status = "MissingSchema"
except requests.exceptions.InvalidURL:
status = "InvalidURL"
except requests.exceptions.InvalidHeader:
status = "InvalidHeader"
except requests.exceptions.URLRequired:
status = "URLmissing"
except requests.exceptions.InvalidProxyURL:
status = "InvalidProxy"
except requests.exceptions.RetryError:
status = "RetryError"
except requests.exceptions.InvalidSchema:
status = "InvalidSchema"
printStatus(url, status, count)
count+=1
time_elapsed = datetime.now() - start_time
def checkUrl(url):
r = requests.get(url, timeout=5)
#print r.status_code
return str(r.status_code)
def printStatus(url, status, count):
color = GREEN
count= count+1
if status != "200":
color=RED
#print(color+status+ENDC+' '+ url)
print(str(count)+'t' + color+status+ENDC+' '+ url)
file.write(str(count)+'t' + color+status+ENDC+' '+ url +'n')
#print('Time elapsed (hh:mm:ss.ms) '.format(time_elapsed))
end = time.time()
print(end - start)
# Main app
#
if __name__ == '__main__':
main()
python python-3.x multithreading multiprocessing
asked Jan 29 at 16:46
Stryker
10315
10315
add a comment |
add a comment |
2 Answers
2
active
oldest
votes
up vote
1
down vote
accepted
Python has something called the GIL (Global Interpreter Lock), which restricts the number of threads that can run concurrently to one. This limitation only concerns pure Python code (so modules written in C, like numpy, might release this lock).
Have you tried using multiprocessing.Pool, instead of multiprocessing.dummy.Pool?
As an additional point, Python has an official style-guide, PEP8. It recommends using lower_case for variables and functions.
add a comment |
up vote
0
down vote
Here is the version I decided to change the code to, which runs a lot faster:
import urllib.request
import urllib.error
import time
from multiprocessing import Pool
# Wall-clock reference for the "done in" report printed after the pool finishes.
start = time.time()
# Read the URL list once and close the file promptly. Each line is stripped
# because readlines() kept the trailing newline as part of every URL; blank
# lines are dropped.
with open('url10.txt', 'r', encoding="ISO-8859-1") as file:
    urls = [line.strip() for line in file if line.strip()]
print(urls)
def checkurl(url, timeout=10):
    """Open *url* and report whether it could be fetched.

    Prints one line describing the outcome and returns the same text, so
    callers such as ``Pool.map`` collect the results instead of ``None``.

    Args:
        url: The URL to open; surrounding whitespace (such as the newline
            left by ``readlines()``) is removed first.
        timeout: Seconds to wait before giving up, so a single unresponsive
            host cannot stall a worker indefinitely.

    Returns:
        ``"good, <url>"`` on success, otherwise ``"<kind>: <detail>, <url>"``
        where ``<kind>`` is HTTPError, URLError, InvalidURL or OSError.
    """
    url = url.strip()
    try:
        # The with-block closes the connection; only reachability matters.
        with urllib.request.urlopen(url, timeout=timeout):
            pass
    except urllib.error.HTTPError as e:
        # Return code error (e.g. 404, 501, ...)
        # HTTPError subclasses URLError, so it must be handled first.
        message = 'HTTPError: {}'.format(e.code) + ', ' + url
    except urllib.error.URLError as e:
        # Not an HTTP-specific error (e.g. connection refused)
        message = 'URLError: {}'.format(e.reason) + ', ' + url
    except ValueError as e:
        # Malformed URL (e.g. missing scheme); report it instead of letting
        # it abort the whole map() call.
        message = 'InvalidURL: {}'.format(e) + ', ' + url
    except OSError as e:
        # Low-level socket failures, including timeouts while reading.
        message = 'OSError: {}'.format(e) + ', ' + url
    else:
        # Request succeeded
        message = 'good' + ', ' + url
    print(message)
    return message
if __name__ == "__main__":
    # map() blocks until every URL has been checked; leaving the with-block
    # then shuts the worker processes down instead of leaking them.
    with Pool(processes=20) as p:
        result = p.map(checkurl, urls)
    print("done in : ", time.time()-start)
Url.txt file contains a list of urls
http://yahoo.com
http://www.google.com
I have about 1000 URLs to check and it seems to work. Any suggestions to improve the functionality?
add a comment |
2 Answers
2
active
oldest
votes
2 Answers
2
active
oldest
votes
active
oldest
votes
active
oldest
votes
up vote
1
down vote
accepted
Python has something called the GIL (Global Interpreter Lock), which restricts the number of threads that can run concurrently to one. This limitation only concerns pure Python code (so modules written in C, like numpy, might release this lock).
Have you tried using multiprocessing.Pool, instead of multiprocessing.dummy.Pool?
As an additional point, Python has an official style-guide, PEP8. It recommends using lower_case for variables and functions.
add a comment |
up vote
1
down vote
accepted
Python has something called the GIL (Global Interpreter Lock), which restricts the number of threads that can run concurrently to one. This limitation only concerns pure Python code (so modules written in C, like numpy, might release this lock).
Have you tried using multiprocessing.Pool, instead of multiprocessing.dummy.Pool?
As an additional point, Python has an official style-guide, PEP8. It recommends using lower_case for variables and functions.
add a comment |
up vote
1
down vote
accepted
up vote
1
down vote
accepted
Python has something called the GIL (Global Interpreter Lock), which restricts the number of threads that can run concurrently to one. This limitation only concerns pure Python code (so modules written in C, like numpy, might release this lock).
Have you tried using multiprocessing.Pool, instead of multiprocessing.dummy.Pool?
As an additional point, Python has an official style-guide, PEP8. It recommends using lower_case for variables and functions.
Python has something called the GIL (Global Interpreter Lock), which restricts the number of threads that can run concurrently to one. This limitation only concerns pure Python code (so modules written in C, like numpy, might release this lock).
Have you tried using multiprocessing.Pool, instead of multiprocessing.dummy.Pool?
As an additional point, Python has an official style-guide, PEP8. It recommends using lower_case for variables and functions.
answered Jan 29 at 18:06
Graipher
20.5k43081
20.5k43081
add a comment |
add a comment |
up vote
0
down vote
Here is the version I decided to change the code to, which runs a lot faster:
import urllib.request
import urllib.error
import time
from multiprocessing import Pool
start = time.time()
file = open('url10.txt', 'r', encoding="ISO-8859-1")
urls = file.readlines()
print(urls)
def checkurl(url):
try:
conn = urllib.request.urlopen(url)
except urllib.error.HTTPError as e:
# Return code error (e.g. 404, 501, ...)
# ...
print('HTTPError: '.format(e.code) + ', ' + url)
except urllib.error.URLError as e:
# Not an HTTP-specific error (e.g. connection refused)
# ...
print('URLError: '.format(e.reason) + ', ' + url)
else:
# 200
# ...
print('good' + ', ' + url)
if __name__ == "__main__":
p = Pool(processes=20)
result = p.map(checkurl, urls)
print("done in : ", time.time()-start)
Url.txt file contains a list of urls
http://yahoo.com
http://www.google.com
I have about 1000 URLs to check and it seems to work. Any suggestions to improve the functionality?
add a comment |
up vote
0
down vote
Here is the version I decided to change the code to, which runs a lot faster:
import urllib.request
import urllib.error
import time
from multiprocessing import Pool
start = time.time()
file = open('url10.txt', 'r', encoding="ISO-8859-1")
urls = file.readlines()
print(urls)
def checkurl(url):
try:
conn = urllib.request.urlopen(url)
except urllib.error.HTTPError as e:
# Return code error (e.g. 404, 501, ...)
# ...
print('HTTPError: '.format(e.code) + ', ' + url)
except urllib.error.URLError as e:
# Not an HTTP-specific error (e.g. connection refused)
# ...
print('URLError: '.format(e.reason) + ', ' + url)
else:
# 200
# ...
print('good' + ', ' + url)
if __name__ == "__main__":
p = Pool(processes=20)
result = p.map(checkurl, urls)
print("done in : ", time.time()-start)
Url.txt file contains a list of urls
http://yahoo.com
http://www.google.com
I have about 1000 URLs to check and it seems to work. Any suggestions to improve the functionality?
add a comment |
up vote
0
down vote
up vote
0
down vote
Here is the version I decided to change the code to, which runs a lot faster:
import urllib.request
import urllib.error
import time
from multiprocessing import Pool
start = time.time()
file = open('url10.txt', 'r', encoding="ISO-8859-1")
urls = file.readlines()
print(urls)
def checkurl(url):
try:
conn = urllib.request.urlopen(url)
except urllib.error.HTTPError as e:
# Return code error (e.g. 404, 501, ...)
# ...
print('HTTPError: '.format(e.code) + ', ' + url)
except urllib.error.URLError as e:
# Not an HTTP-specific error (e.g. connection refused)
# ...
print('URLError: '.format(e.reason) + ', ' + url)
else:
# 200
# ...
print('good' + ', ' + url)
if __name__ == "__main__":
p = Pool(processes=20)
result = p.map(checkurl, urls)
print("done in : ", time.time()-start)
Url.txt file contains a list of urls
http://yahoo.com
http://www.google.com
I have about 1000 URLs to check and it seems to work. Any suggestions to improve the functionality?
Here is the version I decided to change the code to, which runs a lot faster:
import urllib.request
import urllib.error
import time
from multiprocessing import Pool
start = time.time()
file = open('url10.txt', 'r', encoding="ISO-8859-1")
urls = file.readlines()
print(urls)
def checkurl(url):
try:
conn = urllib.request.urlopen(url)
except urllib.error.HTTPError as e:
# Return code error (e.g. 404, 501, ...)
# ...
print('HTTPError: '.format(e.code) + ', ' + url)
except urllib.error.URLError as e:
# Not an HTTP-specific error (e.g. connection refused)
# ...
print('URLError: '.format(e.reason) + ', ' + url)
else:
# 200
# ...
print('good' + ', ' + url)
if __name__ == "__main__":
p = Pool(processes=20)
result = p.map(checkurl, urls)
print("done in : ", time.time()-start)
Url.txt file contains a list of urls
http://yahoo.com
http://www.google.com
I have about 1000 URLs to check and it seems to work. Any suggestions to improve the functionality?
edited Jan 30 at 10:45
Graipher
20.5k43081
20.5k43081
answered Jan 29 at 20:51
Stryker
10315
10315
add a comment |
add a comment |
Sign up or log in
StackExchange.ready(function ()
StackExchange.helpers.onClickDraftSave('#login-link');
);
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
StackExchange.ready(
function ()
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f186268%2floop-through-a-list-of-urls-multi-threading-and-check-return-code-in-python%23new-answer', 'question_page');
);
Post as a guest
Sign up or log in
StackExchange.ready(function ()
StackExchange.helpers.onClickDraftSave('#login-link');
);
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Sign up or log in
StackExchange.ready(function ()
StackExchange.helpers.onClickDraftSave('#login-link');
);
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Sign up or log in
StackExchange.ready(function ()
StackExchange.helpers.onClickDraftSave('#login-link');
);
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Sign up using Google
Sign up using Facebook
Sign up using Email and Password