Wikipedia Random Page in Category Bot
Clash Royale CLAN TAG#URR8PPP
.everyoneloves__top-leaderboard:empty,.everyoneloves__mid-leaderboard:empty margin-bottom:0;
up vote
5
down vote
favorite
I recently wrote a Python script to generate a random page within a Wikipedia category and its subcategories:
"""Generate a random page from a wikipedia category."""
import argparse
import random
import requests
# Module-level configuration. DEBUGGING and max_depth are overwritten from the
# command-line arguments when this file is run as a script.
DEBUGGING = False
max_depth = 4
current_depth = 0  # NOTE(review): no function in this file reads this global
header = "Garrett Credi's Random Page Bot(Contact @ gcc@ameritech.net)"
# Passed as the ``headers`` argument of every API request.
headerVal = {'Api-User-Agent': header}
base_url = 'https://en.wikipedia.org/w/api.php'
def print_debug(str):
    """Print ``str`` prefixed with ``DEBUG: `` when debug/verbose mode is on.

    Nothing is printed unless the module-level ``DEBUGGING`` flag is truthy.
    The parameter name shadows the built-in ``str``; it is kept unchanged so
    existing callers keep working.
    """
    # Reading a module-level name needs no ``global`` statement.
    if DEBUGGING:
        print("DEBUG: " + str)
def generateRequestsParams(category, mode):
    """Build the query-string parameters for one API request.

    Args:
        category: Category title for the "Subcat"/"Subpage" modes, or a page
            title for the "Pagecats" mode.
        mode: "Subcat" (list subcategories), "Subpage" (list pages) or
            "Pagecats" (list the categories a page belongs to).

    Returns:
        A dict suitable for the ``params`` argument of ``requests.get``.
        Any unrecognised mode yields a ``categorymembers`` query with an
        empty ``cmtype``.
    """
    if mode == "Pagecats":
        return {
            'format': 'json',
            'action': 'query',
            'titles': category,
            'prop': 'categories',
        }
    # Map the caller-facing mode name onto the API's ``cmtype`` value.
    cmtype = {"Subcat": 'subcat', "Subpage": 'page'}.get(mode, "")
    return {
        'format': 'json',
        'action': 'query',
        'list': 'categorymembers',
        'cmtitle': category,
        'cmlimit': 500,
        'cmtype': cmtype,
    }
def wrappedRequest(category, mode):
    """Query the API for ``category``, retrying on connection errors.

    Args:
        category: Category title ("Subcat"/"Subpage") or page title
            ("Pagecats").
        mode: Request mode understood by ``generateRequestsParams``.

    Returns:
        A list of result dicts, each carrying a ``'title'`` key when taken
        from the API response: category members for "Subcat"/"Subpage", or
        the categories of the first page in the response for "Pagecats".
        An empty list is returned when the page reports no categories or
        when every attempt failed with a connection error, so callers can
        always iterate over the result.
    """
    params = generateRequestsParams(category, mode)
    max_times = 5
    for attempt in range(1, max_times + 1):
        try:
            r = requests.get(base_url, headers=headerVal, params=params)
        except requests.exceptions.ConnectionError:
            print_debug(
                "Retrying {category} due to connection error "
                "(attempt {attempt} of {max_times})".format(
                    category=category,
                    attempt=attempt,
                    max_times=max_times,
                )
            )
            continue
        # Parse the body once instead of re-decoding it for every lookup.
        query = r.json()['query']
        if mode != "Pagecats":
            return query['categorymembers']
        # Only a single title is requested, so the first page is the answer.
        # A page with no categories has no 'categories' key at all.
        for page in query['pages'].values():
            return page.get('categories', [])
        return []
    print_debug(
        "{category} failed too many times ({times}) times. Moving on".format(
            category=category,
            times=max_times,
        )
    )
    return []
def getSubcategories(category):
    """Collect ``category`` and its subcategories, breadth first.

    The tree is walked level by level for at most ``max_depth`` levels
    (module-level setting). Categories found on the last level are included
    in the result but not expanded further.

    Args:
        category: Title of the root category.

    Returns:
        A list of category titles in discovery order, starting with
        ``category`` itself. Each title appears once.
    """
    seen = {category}  # O(1) membership test; the list keeps discovery order
    allSubcategories = [category]
    frontier = [category]
    current_depth = 1
    while frontier and current_depth <= max_depth:
        print_debug("Current tree depth {d}".format(d=current_depth))
        next_frontier = []
        for subcat in frontier:
            # ``or []`` tolerates a failed request that produced no list.
            for member in wrappedRequest(subcat, mode="Subcat") or []:
                title = member['title']
                print_debug("{subcat} has subcategory {title}".format(
                    subcat=subcat, title=title))
                if title in seen:
                    print_debug(
                        "{t} already checked. Moving on".format(t=title))
                    continue
                seen.add(title)
                allSubcategories.append(title)
                next_frontier.append(title)
        frontier = next_frontier
        current_depth += 1
    return allSubcategories
def saveArray(category, subcats):
    """Write the subcategory titles to ``<category>_subcats.txt``.

    Args:
        category: Category title; used verbatim as the file-name prefix.
        subcats: Iterable of category titles, written one per line.
    """
    filename = "{category}_subcats.txt".format(category=category)
    print_debug("Saving to {f}".format(f=filename))
    with open(filename, 'w') as f:
        f.writelines(cat + "\n" for cat in subcats)
def subcategoriesWithoutDuplicates(category):
    """Return the set of ``category`` and its subcategories.

    Args:
        category: Title of the root category.

    Returns:
        A ``set`` of the titles produced by ``getSubcategories``.
    """
    return set(getSubcategories(category))
def retreiveSubcategoriesFromLocation(category):
    """Load the subcategories from the cache file, or build them afresh.

    Args:
        category: Title of the root category; ``<category>_subcats.txt`` is
            the cache file that is looked for.

    Returns:
        A list of titles read from the cache file (one per line), or the set
        returned by ``subcategoriesWithoutDuplicates`` when the file cannot
        be read.
    """
    fileName = "{category}_subcats.txt".format(category=category)
    try:
        # ``with`` closes the file even if reading fails part-way through.
        with open(fileName, 'r') as subCatFile:
            print_debug("Reading from {filename}".format(filename=fileName))
            # Strip only the line terminator; blank lines carry no title.
            return [line.rstrip("\n") for line in subCatFile if line.strip()]
    except IOError:
        print_debug(
            "{fileName} does not exist. Building from network".format(
                fileName=fileName)
        )
        return subcategoriesWithoutDuplicates(category)
def checkPageSimilarity(page, subcategories):
    """Decide whether ``page`` plausibly belongs to the category tree.

    The score is the fraction of the page's own categories that also occur
    in ``subcategories``; the page passes when the score reaches the
    module-level ``similarityVal`` threshold, which the script entry point
    sets from ``--similarity``.

    Args:
        page: Title of the page to check.
        subcategories: Collection of category titles making up the tree.

    Returns:
        True when the score is at least ``similarityVal``. False otherwise,
        including when no categories could be retrieved for the page.
    """
    pageCats = wrappedRequest(page, mode="Pagecats")
    if not pageCats:
        # Nothing to compare against; also avoids dividing by zero.
        print_debug("Score of {p} is {s}".format(p=page, s=str(0.0)))
        return False
    known = set(subcategories)  # may arrive as a list read from the cache file
    # For every supercategory of page, if it is also in subcategories
    # the page is more likely to be a true subpage.
    matches = sum(1 for cat in pageCats if cat['title'] in known)
    score = matches / len(pageCats)
    print_debug("Score of {p} is {s}".format(p=page, s=str(score)))
    return score >= similarityVal
def randomPage(category, save, regen, check):
    """Pick a random page from ``category`` or one of its subcategories.

    Args:
        category: Title of the root category.
        save: Write the subcategory collection to the cache file.
        regen: Rebuild the subcategory collection from the network instead
            of reading the cache file; also implies saving.
        check: Only accept a page that passes ``checkPageSimilarity``.

    Returns:
        The title of the chosen page.

    Raises:
        ValueError: If there are no subcategories to choose from.
    """
    if regen:
        print_debug("Rebuilding {category}".format(category=category))
        subCats = subcategoriesWithoutDuplicates(category)
    else:
        subCats = retreiveSubcategoriesFromLocation(category)
    if save or regen:
        saveArray(category, subCats)

    # random.choice needs a sequence; the collection may be a set.
    candidates = list(subCats)
    if not candidates:
        raise ValueError(
            "No subcategories available for {category}".format(
                category=category))

    while True:
        cat = random.choice(candidates)
        print_debug("Chose category {cat}".format(cat=cat))
        # ``or []`` tolerates a failed request that produced no list.
        pages = wrappedRequest(cat, mode="Subpage") or []
        while pages:
            page = random.choice(pages)
            title = page['title']
            if not check:
                return title
            print_debug("Checking " + title)
            if checkPageSimilarity(title, subCats):
                return title
            # Rejected: never offer the same page from this category again.
            pages.remove(page)
        print_debug("{cat} has no pages. Retrying".format(cat=cat))
# Script entry point: parse the command line, publish the settings as module
# globals read by the functions above, and print the URL of a random page.
if __name__ == "__main__":
    from urllib.parse import quote

    parser = argparse.ArgumentParser(
        description='Get a random page from a wikipedia category')
    parser.add_argument(
        'category',
        help="The category you wish to get a page from.")
    # No nargs='?': giving the option without a value would yield None.
    parser.add_argument(
        '--tree_depth',
        type=int,
        default=4,
        help="How far down to traverse the subcategory tree")
    parser.add_argument(
        '--similarity',
        type=float,
        default=.5,
        help="What percent of page categories need to be "
             "in subcategory array. Must be used with -c/--check")
    parser.add_argument(
        "-s", "--save",
        action="store_true",
        help="Save subcategories to a file for quick re-runs")
    parser.add_argument(
        "-r", "--regen",
        action="store_true",
        help="Regenerate the subcategory file")
    parser.add_argument(
        "-v", "--verbose",
        action="store_true",
        help="Print debug lines")
    parser.add_argument(
        "-c", "--check",
        action="store_true",
        help="After finding page check to see that it truly "
             "fits in category")
    args = parser.parse_args()

    # Set the globals first so the debug lines below can actually print.
    DEBUGGING = args.verbose
    max_depth = args.tree_depth
    similarityVal = args.similarity
    print_debug(str(args.check))
    if args.save:
        print_debug("Saving!")
    if args.regen:
        print_debug("Regenerating!")

    title = randomPage(
        "Category:" + args.category,
        save=args.save,
        regen=args.regen,
        check=args.check,
    )
    # Page URLs use underscores for spaces; percent-encode the rest so the
    # printed link stays in one piece.
    print("https://en.wikipedia.org/wiki/"
          + quote(title.replace(" ", "_"), safe="/:"))
Currently it runs fairly slowly, since it runs over networks via Python's requests module. Is there any way to make requests more efficient/faster or to get the Wikipedia subcategory tree locally (preferably in a small-ish format)?
I'd appreciate any feedback on the style/structure of my code (e.g. readability, function/variable name, function structure) and any advice on the performance of the program itself.
python python-3.x web-scraping api wikipedia
add a comment |Â
up vote
5
down vote
favorite
I recently wrote a Python script to generate a random page within a Wikipedia category and its subcategories:
"""Generate a random page from a wikipedia category."""
import argparse
import random
import requests
DEBUGGING = False
max_depth = 4
current_depth = 0
header = "Garrett Credi's Random Page Bot(Contact @ gcc@ameritech.net)"
headerVal = 'Api-User-Agent': header
base_url = 'https://en.wikipedia.org/w/api.php'
def print_debug(str):
"""Print strings if in debug/verbose mode mode."""
global DEBUGGING
if(DEBUGGING):
print("DEBUG: " + str)
def generateRequestsParams(category, mode):
"""Generate the params for requests given a category and a mode."""
cmtype = ""
if(mode == "Subcat"):
cmtype = 'subcat'
elif(mode == "Subpage"):
cmtype = 'page'
params =
'format': 'json',
'action': 'query',
'list': 'categorymembers',
'cmtitle': category,
'cmlimit': 500,
'cmtype': cmtype
if(mode == "Pagecats"):
params =
'format': 'json',
'action': 'query',
'titles': category,
'prop': 'categories'
return params
def wrappedRequest(category, mode):
"""Wrap a request to deal with connection errors."""
global base_url
params = generateRequestsParams(category, mode)
global headerVal
max_times = 5
times = 0
propertyString = 'categorymembers'
while(times < max_times):
try:
r = requests.get(base_url, headers=headerVal, params=params)
if(mode != "Pagecats"):
return r.json()['query'][propertyString]
else:
for key in r.json()['query']['pages']:
return r.json()['query']['pages'][key]['categories']
except requests.exceptions.ConnectionError as e:
if(times > max_times):
print_debug("category failed too many times (times) " +
" times. Moving on".format(
category=category,
times=times
)
)
times = 0
return [category]
else:
print_debug("Retrying category due to connection " +
" error".format(
cateogry=category
)
)
times += 1
def getSubcategories(category):
"""Get subcategories of a given subcategory."""
global max_depth, DEBUGGING
current_depth = 1
singleStepSubcategories = [category]
allSubcategories =
while(current_depth <= max_depth):
print_debug("Current tree depth d".format(d=current_depth))
subcategoryTemp =
if(len(singleStepSubcategories) == 0):
break
for subcat in singleStepSubcategories:
allSubcategories.append(subcat)
subcategories = wrappedRequest(subcat, mode="Subcat")
for cat in subcategories:
title = cat['title']
print_debug("subcat has subcategory title".format(
subcat=subcat,
title=title
)
)
if(title not in allSubcategories):
allSubcategories.append(title)
subcategoryTemp.append(title)
else:
print_debug("t already checked. Moving on".format(
t=title
)
)
singleStepSubcategories = subcategoryTemp
current_depth += 1
return allSubcategories
def saveArray(category, subcats):
"""Save array to file."""
filename = "category_subcats.txt".format(category=category)
print_debug("Saving to f".format(f=filename))
with open(filename, 'w') as f:
for cat in subcats:
f.write(cat+"n")
def subcategoriesWithoutDuplicates(category):
"""Generate a list of subcategories without duplicates."""
return set(getSubcategories(category))
def retreiveSubcategoriesFromLocation(category):
"""Get subcategories from file, or generate them from scratch."""
subCats =
fileName = "category_subcats.txt".format(category=category)
try:
subCatFile = open(fileName, 'r')
print_debug("Reading from filename".format(filename=fileName))
for count, line in enumerate(subCatFile):
subCats.append(line.replace("n", ""))
subCatFile.close()
except IOError as ioError:
print_debug("fileName does not exist. Building from " +
" network".format(fileName=fileName)
)
subCats = subcategoriesWithoutDuplicates(category)
return subCats
def checkPageSimilarity(page, subcategories):
"""Check the similarity of page to a list of subcategories.
Verify if page truly is a subpage of a category.
"""
global similarityVal
pageCats = wrappedRequest(page, mode="Pagecats")
points = 0.0
# For every supercategory of page, if it is also in subcategories
# the page is more likely to be a true subpage.
for cat in pageCats:
title = cat['title']
if(title in subcategories):
points += 1.0
score = points/len(pageCats)
print_debug("Score of p is s".format(p=page, s=str(score)))
if(score >= similarityVal):
return True
return False
def randomPage(category, save, regen, check):
"""Generate a random page from a category."""
global DEBUGGING
subCats =
read = True
if(not regen):
subCats = retreiveSubcategoriesFromLocation(category)
if(regen or (not read)):
print_debug("Rebuilding category".format(category=category))
subCats = subcategoriesWithoutDuplicates(category)
if(save or regen):
saveArray(category, subCats)
randomPage = None
validRandomPage = True
cat = random.sample(subCats, 1)[0]
print_debug("Chose category cat".format(cat=cat))
pages = wrappedRequest(cat, mode="Subpage")
while(not randomPage or not validRandomPage):
try:
randomPage = random.choice(pages)
title = randomPage['title']
if(check):
print_debug("Checking " + title)
validRandomPage = checkPageSimilarity(title, subCats)
if(not validRandomPage):
pages.remove(randomPage)
except IndexError as a:
print_debug("cat has no pages. Retrying".format(cat=cat))
cat = random.sample(subCats, 1)[0]
print_debug("Chose category cat".format(cat=cat))
pages = wrappedRequest(cat, mode="Subpage")
return randomPage['title']
if(__name__ == "__main__"):
parser = argparse.ArgumentParser(description='Get a random page from a ' +
'wikipedia category')
parser.add_argument('category', help="The category you wish to get a " +
"page from."
)
parser.add_argument('--tree_depth',
nargs='?',
type=int,
default=4,
help="How far down to traverse the subcategory tree"
)
parser.add_argument('--similarity',
nargs='?',
type=float,
default=.5,
help="What percent of page categories need to be " +
"in subcategory array. Must be used with -c/--check")
parser.add_argument("-s",
"--save",
action="store_true",
help="Save subcategories to a file for quick re-runs"
)
parser.add_argument("-r",
"--regen",
action="store_true",
help="Regenerate the subcategory file"
)
parser.add_argument("-v",
"--verbose",
action="store_true",
help="Print debug lines"
)
parser.add_argument("-c",
"--check",
action="store_true",
help="After finding page check to see that it truly " +
"fits in category"
)
args = parser.parse_args()
print_debug(str(args.check))
DEBUGGING = args.verbose
max_depth = args.tree_depth
similarityVal = args.similarity
if(args.save):
print_debug("Saving!")
if(args.regen):
print_debug("Regenerating!")
print("https://en.wikipedia.org/wiki/" + randomPage("Category:" +
args.category,
save=args.save,
regen=args.regen,
check=args.check
)
)
Currently it runs fairly slowly, since it runs over networks via Python's requests module. Is there any way to make requests more efficient/faster or to get the Wikipedia subcategory tree locally (preferably in a small-ish format)?
I'd appreciate any feedback on the style/structure of my code (e.g. readability, function/variable name, function structure) and any advice on the performance of the program itself.
python python-3.x web-scraping api wikipedia
add a comment |Â
up vote
5
down vote
favorite
up vote
5
down vote
favorite
I recently wrote a Python script to generate a random page within a Wikipedia category and its subcategories:
"""Generate a random page from a wikipedia category."""
import argparse
import random
import requests
DEBUGGING = False
max_depth = 4
current_depth = 0
header = "Garrett Credi's Random Page Bot(Contact @ gcc@ameritech.net)"
headerVal = 'Api-User-Agent': header
base_url = 'https://en.wikipedia.org/w/api.php'
def print_debug(str):
"""Print strings if in debug/verbose mode mode."""
global DEBUGGING
if(DEBUGGING):
print("DEBUG: " + str)
def generateRequestsParams(category, mode):
"""Generate the params for requests given a category and a mode."""
cmtype = ""
if(mode == "Subcat"):
cmtype = 'subcat'
elif(mode == "Subpage"):
cmtype = 'page'
params =
'format': 'json',
'action': 'query',
'list': 'categorymembers',
'cmtitle': category,
'cmlimit': 500,
'cmtype': cmtype
if(mode == "Pagecats"):
params =
'format': 'json',
'action': 'query',
'titles': category,
'prop': 'categories'
return params
def wrappedRequest(category, mode):
"""Wrap a request to deal with connection errors."""
global base_url
params = generateRequestsParams(category, mode)
global headerVal
max_times = 5
times = 0
propertyString = 'categorymembers'
while(times < max_times):
try:
r = requests.get(base_url, headers=headerVal, params=params)
if(mode != "Pagecats"):
return r.json()['query'][propertyString]
else:
for key in r.json()['query']['pages']:
return r.json()['query']['pages'][key]['categories']
except requests.exceptions.ConnectionError as e:
if(times > max_times):
print_debug("category failed too many times (times) " +
" times. Moving on".format(
category=category,
times=times
)
)
times = 0
return [category]
else:
print_debug("Retrying category due to connection " +
" error".format(
cateogry=category
)
)
times += 1
def getSubcategories(category):
"""Get subcategories of a given subcategory."""
global max_depth, DEBUGGING
current_depth = 1
singleStepSubcategories = [category]
allSubcategories =
while(current_depth <= max_depth):
print_debug("Current tree depth d".format(d=current_depth))
subcategoryTemp =
if(len(singleStepSubcategories) == 0):
break
for subcat in singleStepSubcategories:
allSubcategories.append(subcat)
subcategories = wrappedRequest(subcat, mode="Subcat")
for cat in subcategories:
title = cat['title']
print_debug("subcat has subcategory title".format(
subcat=subcat,
title=title
)
)
if(title not in allSubcategories):
allSubcategories.append(title)
subcategoryTemp.append(title)
else:
print_debug("t already checked. Moving on".format(
t=title
)
)
singleStepSubcategories = subcategoryTemp
current_depth += 1
return allSubcategories
def saveArray(category, subcats):
"""Save array to file."""
filename = "category_subcats.txt".format(category=category)
print_debug("Saving to f".format(f=filename))
with open(filename, 'w') as f:
for cat in subcats:
f.write(cat+"n")
def subcategoriesWithoutDuplicates(category):
"""Generate a list of subcategories without duplicates."""
return set(getSubcategories(category))
def retreiveSubcategoriesFromLocation(category):
"""Get subcategories from file, or generate them from scratch."""
subCats =
fileName = "category_subcats.txt".format(category=category)
try:
subCatFile = open(fileName, 'r')
print_debug("Reading from filename".format(filename=fileName))
for count, line in enumerate(subCatFile):
subCats.append(line.replace("n", ""))
subCatFile.close()
except IOError as ioError:
print_debug("fileName does not exist. Building from " +
" network".format(fileName=fileName)
)
subCats = subcategoriesWithoutDuplicates(category)
return subCats
def checkPageSimilarity(page, subcategories):
"""Check the similarity of page to a list of subcategories.
Verify if page truly is a subpage of a category.
"""
global similarityVal
pageCats = wrappedRequest(page, mode="Pagecats")
points = 0.0
# For every supercategory of page, if it is also in subcategories
# the page is more likely to be a true subpage.
for cat in pageCats:
title = cat['title']
if(title in subcategories):
points += 1.0
score = points/len(pageCats)
print_debug("Score of p is s".format(p=page, s=str(score)))
if(score >= similarityVal):
return True
return False
def randomPage(category, save, regen, check):
"""Generate a random page from a category."""
global DEBUGGING
subCats =
read = True
if(not regen):
subCats = retreiveSubcategoriesFromLocation(category)
if(regen or (not read)):
print_debug("Rebuilding category".format(category=category))
subCats = subcategoriesWithoutDuplicates(category)
if(save or regen):
saveArray(category, subCats)
randomPage = None
validRandomPage = True
cat = random.sample(subCats, 1)[0]
print_debug("Chose category cat".format(cat=cat))
pages = wrappedRequest(cat, mode="Subpage")
while(not randomPage or not validRandomPage):
try:
randomPage = random.choice(pages)
title = randomPage['title']
if(check):
print_debug("Checking " + title)
validRandomPage = checkPageSimilarity(title, subCats)
if(not validRandomPage):
pages.remove(randomPage)
except IndexError as a:
print_debug("cat has no pages. Retrying".format(cat=cat))
cat = random.sample(subCats, 1)[0]
print_debug("Chose category cat".format(cat=cat))
pages = wrappedRequest(cat, mode="Subpage")
return randomPage['title']
if(__name__ == "__main__"):
parser = argparse.ArgumentParser(description='Get a random page from a ' +
'wikipedia category')
parser.add_argument('category', help="The category you wish to get a " +
"page from."
)
parser.add_argument('--tree_depth',
nargs='?',
type=int,
default=4,
help="How far down to traverse the subcategory tree"
)
parser.add_argument('--similarity',
nargs='?',
type=float,
default=.5,
help="What percent of page categories need to be " +
"in subcategory array. Must be used with -c/--check")
parser.add_argument("-s",
"--save",
action="store_true",
help="Save subcategories to a file for quick re-runs"
)
parser.add_argument("-r",
"--regen",
action="store_true",
help="Regenerate the subcategory file"
)
parser.add_argument("-v",
"--verbose",
action="store_true",
help="Print debug lines"
)
parser.add_argument("-c",
"--check",
action="store_true",
help="After finding page check to see that it truly " +
"fits in category"
)
args = parser.parse_args()
print_debug(str(args.check))
DEBUGGING = args.verbose
max_depth = args.tree_depth
similarityVal = args.similarity
if(args.save):
print_debug("Saving!")
if(args.regen):
print_debug("Regenerating!")
print("https://en.wikipedia.org/wiki/" + randomPage("Category:" +
args.category,
save=args.save,
regen=args.regen,
check=args.check
)
)
Currently it runs fairly slowly, since it runs over networks via Python's requests module. Is there any way to make requests more efficient/faster or to get the Wikipedia subcategory tree locally (preferably in a small-ish format)?
I'd appreciate any feedback on the style/structure of my code (e.g. readability, function/variable name, function structure) and any advice on the performance of the program itself.
python python-3.x web-scraping api wikipedia
I recently wrote a Python script to generate a random page within a Wikipedia category and its subcategories:
"""Generate a random page from a wikipedia category."""
import argparse
import random
import requests
DEBUGGING = False
max_depth = 4
current_depth = 0
header = "Garrett Credi's Random Page Bot(Contact @ gcc@ameritech.net)"
headerVal = 'Api-User-Agent': header
base_url = 'https://en.wikipedia.org/w/api.php'
def print_debug(str):
"""Print strings if in debug/verbose mode mode."""
global DEBUGGING
if(DEBUGGING):
print("DEBUG: " + str)
def generateRequestsParams(category, mode):
"""Generate the params for requests given a category and a mode."""
cmtype = ""
if(mode == "Subcat"):
cmtype = 'subcat'
elif(mode == "Subpage"):
cmtype = 'page'
params =
'format': 'json',
'action': 'query',
'list': 'categorymembers',
'cmtitle': category,
'cmlimit': 500,
'cmtype': cmtype
if(mode == "Pagecats"):
params =
'format': 'json',
'action': 'query',
'titles': category,
'prop': 'categories'
return params
def wrappedRequest(category, mode):
"""Wrap a request to deal with connection errors."""
global base_url
params = generateRequestsParams(category, mode)
global headerVal
max_times = 5
times = 0
propertyString = 'categorymembers'
while(times < max_times):
try:
r = requests.get(base_url, headers=headerVal, params=params)
if(mode != "Pagecats"):
return r.json()['query'][propertyString]
else:
for key in r.json()['query']['pages']:
return r.json()['query']['pages'][key]['categories']
except requests.exceptions.ConnectionError as e:
if(times > max_times):
print_debug("category failed too many times (times) " +
" times. Moving on".format(
category=category,
times=times
)
)
times = 0
return [category]
else:
print_debug("Retrying category due to connection " +
" error".format(
cateogry=category
)
)
times += 1
def getSubcategories(category):
"""Get subcategories of a given subcategory."""
global max_depth, DEBUGGING
current_depth = 1
singleStepSubcategories = [category]
allSubcategories =
while(current_depth <= max_depth):
print_debug("Current tree depth d".format(d=current_depth))
subcategoryTemp =
if(len(singleStepSubcategories) == 0):
break
for subcat in singleStepSubcategories:
allSubcategories.append(subcat)
subcategories = wrappedRequest(subcat, mode="Subcat")
for cat in subcategories:
title = cat['title']
print_debug("subcat has subcategory title".format(
subcat=subcat,
title=title
)
)
if(title not in allSubcategories):
allSubcategories.append(title)
subcategoryTemp.append(title)
else:
print_debug("t already checked. Moving on".format(
t=title
)
)
singleStepSubcategories = subcategoryTemp
current_depth += 1
return allSubcategories
def saveArray(category, subcats):
"""Save array to file."""
filename = "category_subcats.txt".format(category=category)
print_debug("Saving to f".format(f=filename))
with open(filename, 'w') as f:
for cat in subcats:
f.write(cat+"n")
def subcategoriesWithoutDuplicates(category):
"""Generate a list of subcategories without duplicates."""
return set(getSubcategories(category))
def retreiveSubcategoriesFromLocation(category):
"""Get subcategories from file, or generate them from scratch."""
subCats =
fileName = "category_subcats.txt".format(category=category)
try:
subCatFile = open(fileName, 'r')
print_debug("Reading from filename".format(filename=fileName))
for count, line in enumerate(subCatFile):
subCats.append(line.replace("n", ""))
subCatFile.close()
except IOError as ioError:
print_debug("fileName does not exist. Building from " +
" network".format(fileName=fileName)
)
subCats = subcategoriesWithoutDuplicates(category)
return subCats
def checkPageSimilarity(page, subcategories):
"""Check the similarity of page to a list of subcategories.
Verify if page truly is a subpage of a category.
"""
global similarityVal
pageCats = wrappedRequest(page, mode="Pagecats")
points = 0.0
# For every supercategory of page, if it is also in subcategories
# the page is more likely to be a true subpage.
for cat in pageCats:
title = cat['title']
if(title in subcategories):
points += 1.0
score = points/len(pageCats)
print_debug("Score of p is s".format(p=page, s=str(score)))
if(score >= similarityVal):
return True
return False
def randomPage(category, save, regen, check):
"""Generate a random page from a category."""
global DEBUGGING
subCats =
read = True
if(not regen):
subCats = retreiveSubcategoriesFromLocation(category)
if(regen or (not read)):
print_debug("Rebuilding category".format(category=category))
subCats = subcategoriesWithoutDuplicates(category)
if(save or regen):
saveArray(category, subCats)
randomPage = None
validRandomPage = True
cat = random.sample(subCats, 1)[0]
print_debug("Chose category cat".format(cat=cat))
pages = wrappedRequest(cat, mode="Subpage")
while(not randomPage or not validRandomPage):
try:
randomPage = random.choice(pages)
title = randomPage['title']
if(check):
print_debug("Checking " + title)
validRandomPage = checkPageSimilarity(title, subCats)
if(not validRandomPage):
pages.remove(randomPage)
except IndexError as a:
print_debug("cat has no pages. Retrying".format(cat=cat))
cat = random.sample(subCats, 1)[0]
print_debug("Chose category cat".format(cat=cat))
pages = wrappedRequest(cat, mode="Subpage")
return randomPage['title']
if(__name__ == "__main__"):
parser = argparse.ArgumentParser(description='Get a random page from a ' +
'wikipedia category')
parser.add_argument('category', help="The category you wish to get a " +
"page from."
)
parser.add_argument('--tree_depth',
nargs='?',
type=int,
default=4,
help="How far down to traverse the subcategory tree"
)
parser.add_argument('--similarity',
nargs='?',
type=float,
default=.5,
help="What percent of page categories need to be " +
"in subcategory array. Must be used with -c/--check")
parser.add_argument("-s",
"--save",
action="store_true",
help="Save subcategories to a file for quick re-runs"
)
parser.add_argument("-r",
"--regen",
action="store_true",
help="Regenerate the subcategory file"
)
parser.add_argument("-v",
"--verbose",
action="store_true",
help="Print debug lines"
)
parser.add_argument("-c",
"--check",
action="store_true",
help="After finding page check to see that it truly " +
"fits in category"
)
args = parser.parse_args()
print_debug(str(args.check))
DEBUGGING = args.verbose
max_depth = args.tree_depth
similarityVal = args.similarity
if(args.save):
print_debug("Saving!")
if(args.regen):
print_debug("Regenerating!")
print("https://en.wikipedia.org/wiki/" + randomPage("Category:" +
args.category,
save=args.save,
regen=args.regen,
check=args.check
)
)
Currently it runs fairly slowly, since it runs over networks via Python's requests module. Is there any way to make requests more efficient/faster or to get the Wikipedia subcategory tree locally (preferably in a small-ish format)?
I'd appreciate any feedback on the style/structure of my code (e.g. readability, function/variable name, function structure) and any advice on the performance of the program itself.
python python-3.x web-scraping api wikipedia
edited May 4 at 12:38
Daniel
4,1132836
4,1132836
asked May 2 at 2:20
Garrett Credi
363
363
add a comment |Â
add a comment |Â
1 Answer
1
active
oldest
votes
up vote
2
down vote
General
In `print_debug()`, you don't have to use the `global` keyword to refer to `DEBUGGING`. If the Python interpreter can't find the name `DEBUGGING` locally, it will then search for it globally. If it still can't find it, a `NameError` is raised. The only two reasons to use `global` are:
- When you have a local and a global variable with the same name, and you explicitly want to refer to the global variable;
- In local scope, when you need to (re)assign to a global variable.
In `print_debug()`, you're shadowing the built-in `str`. To avoid shadowing a variable, by convention, you should add a trailing underscore (as in `str_`). If you find that ugly, you can also spell it out, or abbreviate further: `string` or `s` (the former is more desirable).
You don't need parentheses around `if`-statements and `while`-statements.
The idiomatic way of checking if a container is empty in Python is to directly use it in an
`if`-statement, in this fashion:
    if not container:
        # Container is empty
... this works because a container's `__bool__()` method returns `True` if it is not empty, and `False` otherwise.
When catching an exception, if you don't need access to the exception instance itself, you should leave out the `as ...` part. If you do need access to the exception instance, most people use:
    except <exception type> as exc:
... or:
    except <exception type> as err:
The following:
    if <boolean expression>:
        return True
    return False
... can be shortened to:
    return <boolean expression>
In `randomPage()`, the following:
    if(regen or (not read)):
... can be simplified to become:
    if regen or not read:
Avoid global variables. They are a telltale sign of a design problem in your code. Global constants are acceptable, but non-constant global variables can cause all kinds of trouble:
It's hard to track where they are being used and modified. This problem becomes very prominent when using threads;
Loading a global variable is more costly than loading a local one;
If you design an API and have lots of global variables floating about, when someone performs a wildcard import, their global namespace will be cluttered.
Generally, if you find yourself doing this:
    my_list = []
    for x in some_container:
        if some_condition_applies(x):
            my_list.append(x)
... a list comprehension would be a good fit:
    my_list = [x for x in some_container if some_condition_applies(x)]
A list comprehension is shorter and often faster than its `for`-loop counterpart.
Debug messages should be sent to stderr, not stdout:
import sys
...
print("DEBUG: " + str, file=sys.stderr)
PEP-8
PEP-8 is the name of the official Python style guide. You violated it a couple of times:
Your indentation is all over the place. Sorry, but it had to be said. Take this excerpt:
print_debug("{t} already checked. Moving on".format(
t=title
)
)
... isn't that hard to read? Something like this would be much easier on the eyes:
print_debug(
    "{t} already checked. Moving on".format(t=title)
)
... or this:
print_debug("{t} already checked. Moving on".format(
    t=title)
)
Ultimately, it's up to you. PEP-8 also lists some examples of acceptable styles.[1]
Use
snake_case
for function and variable names. Only useUPPERCASE_WITH_UNDERSCORES
for constants.2Don't mix single and double quotes. I prefer double quotes, because they're less likely to cause clashes with quotation marks in flowing text.3
Limit the line length to 79 characters.4
Docstrings
Good job on adding docstrings to your functions and to the module itself. Most people (including me) often can't bring up the effort to add documentation, but you have! :) If you want to help yourself understand the code in 6 months' time, or if you want to publish this as an API, you should be a bit more thorough, for instance by adding information about:
What arguments the function takes, what type they should be, and exactly what they convey;
The return type of the value;
Any special cases the caller should be aware of.
There's even a style guide for docstrings: PEP-257.
Performance
I don't currently have the time to do a full review on performance, but I suggest you run a profiler to see where you can optimize. The problem is very likely to be I/O-bound, but the debugging can have a noticeable performance hit.
If you want to squeeze a little bit of extra speed out of requests
, you can use a requests.Session
object, which:
... [omitted] will use urllib3's connection pooling. So if you're making several requests to the same host, the underlying TCP connection will be reused, which can result in a significant performance increase ... [omitted]
References
1 PEP-8: Indentation
2 PEP-8: Function and variable names
3 PEP-8: String Quotes
4 PEP-8: Maximum Line Length
add a comment |Â
1 Answer
1
active
oldest
votes
1 Answer
1
active
oldest
votes
active
oldest
votes
active
oldest
votes
up vote
2
down vote
General
In print_debug(), you don't have to use the global keyword to refer to DEBUGGING. If the Python interpreter can't find the name DEBUGGING locally, it will then search for it globally. If it still can't find it, a NameError is raised. The only two reasons to use global are:
When you have a local and a global variable with the same name, and you explicitly want to refer to the global variable;
In local scope, when you need to (re)assign to a global variable.
In print_debug(), you're shadowing the built-in str. To avoid shadowing a variable, by convention, you should add a trailing underscore (as in str_). If you find that ugly, you can also spell it out, or abbreviate further: string or s (the former is more desirable).
You don't need parentheses around if-statements and while-statements.
The idiomatic way of checking if a container is empty in Python is to directly use it in an if-statement, in this fashion:
if not container:
    # Container is empty
... this works because a container's __bool__() method returns True if it is not empty, and False otherwise.
When catching an exception, if you don't need access to the exception instance itself, you should leave out the as ... part.
If you do need access to the exception instance, most people use:
except <exception type> as exc:
... or:
except <exception type> as err:
The following:
if <boolean expression>:
    return True
return False
... can be shortened to:
return <boolean expression>
In randomPage(), the following:
if(regen or (not read)):
... can be simplified to become:
if regen or not read:
Avoid global variables. They are a telltale sign of a design problem in your code. Global constants are acceptable, but non-constant global variables can cause all kinds of trouble:
It's hard to track where they are being used and modified. This problem becomes very prominent when using threads;
Loading a global variable is more costly than loading a local one;
If you design an API and have lots of global variables floating about, when someone performs a wildcard import, their global namespace will be cluttered.
Generally, if you find yourself doing this:
my_list = []
for x in some_container:
    if some_condition_applies(x):
        my_list.append(x)
... a list comprehension would be a good fit:
my_list = [x for x in some_container if some_condition_applies(x)]
A list comprehension is shorter and often faster than its for-loop counterpart.
Debug messages should be sent to stderr, not stdout:
import sys
...
print("DEBUG: " + str, file=sys.stderr)
PEP-8
PEP-8 is the name of the official Python style guide. You violated it a couple of times:
Your indentation is all over the place. Sorry, but it had to be said. Take this excerpt:
print_debug("{t} already checked. Moving on".format(
t=title
)
)
... isn't that hard to read? Something like this would be much easier on the eyes:
print_debug(
    "{t} already checked. Moving on".format(t=title)
)
... or this:
print_debug("{t} already checked. Moving on".format(
    t=title)
)
Ultimately, it's up to you. PEP-8 also lists some examples of acceptable styles.[1]
Use snake_case for function and variable names. Only use UPPERCASE_WITH_UNDERSCORES for constants.[2]
Don't mix single and double quotes. I prefer double quotes, because they're less likely to cause clashes with quotation marks in flowing text.[3]
Limit the line length to 79 characters.[4]
Docstrings
Good job on adding docstrings to your functions and to the module itself. Most people (including me) often can't bring up the effort to add documentation, but you have! :) If you want to help yourself understand the code in 6 months' time, or if you want to publish this as an API, you should be a bit more thorough, for instance by adding information about:
What arguments the function takes, what type they should be, and exactly what they convey;
The return type of the value;
Any special cases the caller should be aware of.
There's even a style guide for docstrings: PEP-257.
Performance
I don't currently have the time to do a full review on performance, but I suggest you run a profiler to see where you can optimize. The problem is very likely to be I/O-bound, but the debugging can have a noticeable performance hit.
If you want to squeeze a little bit of extra speed out of requests, you can use a requests.Session object, which:
... [omitted] will use urllib3's connection pooling. So if you're making several requests to the same host, the underlying TCP connection will be reused, which can result in a significant performance increase ... [omitted]
References
1 PEP-8: Indentation
2 PEP-8: Function and variable names
3 PEP-8: String Quotes
4 PEP-8: Maximum Line Length
add a comment |Â
up vote
2
down vote
General
In
print_debug()
, you don't have to use theglobal
keyword to refer toDEBUGGING
. If the Python interpreter can't find the nameDEBUGGING
locally, it will then search for it globally. If it still can't find it, aNameError
is raised. The only two reasons to useglobal
are:When you have a local and a global variable with the same name, and you explicitly want to refer to the global variable;
In local scope, when you need to (re)assign to a global variable.
In
print_debug()
, you're shadowing the built-instr
. To avoid shadowing a variable, by convention, you should add a trailing underscore (as instr_
). If you find that ugly, you can also spell it out, or abbreviate further:string
ors
(the former is more desirable).You don't need parentheses around
if
-statements andwhile
-statements.The idiomatic way of checking if a container is empty in Python is to directly use it in an
if
-statement, in this fashion:if not container:
# Container is empty... this works because a container's
__bool__()
method returnsTrue
if it is not empty, andFalse
otherwise.When catching an exception, if you don't need access to the exception instance itself, you should leave out the
as ...
part.If you do need access to the exception instance, most people use:
except <exception type> as exc:
... or:
except <exception type> as err:
The following:
if <boolean expression>:
return True
return False... can be shortened to:
return <boolean expression>
In
randomPage()
, the following:if(regen or (not read)):
... can be simplified to become:
if regen or not read:
Avoid global variables. They are a telltale sign of a design problem in your code. Global constants are acceptable, but non-constant global variables can cause all kinds of trouble:
It's hard to track where they are being used and modified. This problem becomes very prominent when using threads;
Loading a global variable is more costly than loading a local one;
If you design an API and have lots of global variables floating about, when someone performs a wildcard import, their global namespace will be cluttered.
Generally, if you find yourself doing this:
my_list = []
for x in some_container:
    if some_condition_applies(x):
        my_list.append(x)
... a list comprehension would be a good fit:
my_list = [x for x in some_container if some_condition_applies(x)]
A list comprehension is shorter and often faster than its for-loop counterpart.
Debug messages should be sent to stderr, not stdout:
import sys
...
print("DEBUG: " + str, file=sys.stderr)
PEP-8
PEP-8 is the name of the official Python style guide. You violated it a couple of times:
Your indentation is all over the place. Sorry, but it had to be said. Take this excerpt:
print_debug("{t} already checked. Moving on".format(
t=title
)
)
... isn't that hard to read? Something like this would be much easier on the eyes:
print_debug(
    "{t} already checked. Moving on".format(t=title)
)
... or this:
print_debug("{t} already checked. Moving on".format(
    t=title)
)
Ultimately, it's up to you. PEP-8 also lists some examples of acceptable styles.[1]
Use
snake_case
for function and variable names. Only useUPPERCASE_WITH_UNDERSCORES
for constants.2Don't mix single and double quotes. I prefer double quotes, because they're less likely to cause clashes with quotation marks in flowing text.3
Limit the line length to 79 characters.4
Docstrings
Good job on adding docstrings to your functions and to the module itself. Most people (including me) often can't bring up the effort to add documentation, but you have! :) If you want to help yourself understand the code in 6 months' time, or if you want to publish this as an API, you should be a bit more thorough, for instance by adding information about:
What arguments the function takes, what type they should be, and exactly what they convey;
The return type of the value;
Any special cases the caller should be aware of.
There's even a style guide for docstrings: PEP-257.
Performance
I don't currently have the time to do a full review on performance, but I suggest you run a profiler to see where you can optimize. The problem is very likely to be I/O-bound, but the debugging can have a noticeable performance hit.
If you want to squeeze a little bit of extra speed out of requests
, you can use a requests.Session
object, which:
... [omitted] will use urllib3's connection pooling. So if you're making several requests to the same host, the underlying TCP connection will be reused, which can result in a significant performance increase ... [omitted]
References
1 PEP-8: Indentation
2 PEP-8: Function and variable names
3 PEP-8: String Quotes
4 PEP-8: Maximum Line Length
add a comment |Â
up vote
2
down vote
up vote
2
down vote
General
In
print_debug()
, you don't have to use theglobal
keyword to refer toDEBUGGING
. If the Python interpreter can't find the nameDEBUGGING
locally, it will then search for it globally. If it still can't find it, aNameError
is raised. The only two reasons to useglobal
are:When you have a local and a global variable with the same name, and you explicitly want to refer to the global variable;
In local scope, when you need to (re)assign to a global variable.
In
print_debug()
, you're shadowing the built-instr
. To avoid shadowing a variable, by convention, you should add a trailing underscore (as instr_
). If you find that ugly, you can also spell it out, or abbreviate further:string
ors
(the former is more desirable).You don't need parentheses around
if
-statements andwhile
-statements.The idiomatic way of checking if a container is empty in Python is to directly use it in an
if
-statement, in this fashion:if not container:
# Container is empty... this works because a container's
__bool__()
method returnsTrue
if it is not empty, andFalse
otherwise.When catching an exception, if you don't need access to the exception instance itself, you should leave out the
as ...
part.If you do need access to the exception instance, most people use:
except <exception type> as exc:
... or:
except <exception type> as err:
The following:
if <boolean expression>:
return True
return False... can be shortened to:
return <boolean expression>
In
randomPage()
, the following:if(regen or (not read)):
... can be simplified to become:
if regen or not read:
Avoid global variables. They are a telltale sign of a design problem in your code. Global constants are acceptable, but non-constant global variables can cause all kinds of trouble:
It's hard to track where they are being used and modified. This problem becomes very prominent when using threads;
Loading a global variable is more costly than loading a local one;
If you design an API and have lots of global variables floating about, when someone performs a wildcard import, their global namespace will be cluttered.
Generally, if you find yourself doing this:
my_list = []
for x in some_container:
    if some_condition_applies(x):
        my_list.append(x)
... a list comprehension would be a good fit:
my_list = [x for x in some_container if some_condition_applies(x)]
A list comprehension is shorter and often faster than its for-loop counterpart.
Debug messages should be sent to stderr, not stdout:
import sys
...
print("DEBUG: " + str, file=sys.stderr)
PEP-8
PEP-8 is the name of the official Python style guide. You violated it a couple of times:
Your indentation is all over the place. Sorry, but it had to be said. Take this excerpt:
print_debug("{t} already checked. Moving on".format(
t=title
)
)
... isn't that hard to read? Something like this would be much easier on the eyes:
print_debug(
    "{t} already checked. Moving on".format(t=title)
)
... or this:
print_debug("{t} already checked. Moving on".format(
    t=title)
)
Ultimately, it's up to you. PEP-8 also lists some examples of acceptable styles.[1]
Use
snake_case
for function and variable names. Only useUPPERCASE_WITH_UNDERSCORES
for constants.2Don't mix single and double quotes. I prefer double quotes, because they're less likely to cause clashes with quotation marks in flowing text.3
Limit the line length to 79 characters.4
Docstrings
Good job on adding docstrings to your functions and to the module itself. Most people (including me) often can't bring up the effort to add documentation, but you have! :) If you want to help yourself understand the code in 6 months' time, or if you want to publish this as an API, you should be a bit more thorough, for instance by adding information about:
What arguments the function takes, what type they should be, and exactly what they convey;
The return type of the value;
Any special cases the caller should be aware of.
There's even a style guide for docstrings: PEP-257.
Performance
I don't currently have the time to do a full review on performance, but I suggest you run a profiler to see where you can optimize. The problem is very likely to be I/O-bound, but the debugging can have a noticeable performance hit.
If you want to squeeze a little bit of extra speed out of requests
, you can use a requests.Session
object, which:
... [omitted] will use urllib3's connection pooling. So if you're making several requests to the same host, the underlying TCP connection will be reused, which can result in a significant performance increase ... [omitted]
References
1 PEP-8: Indentation
2 PEP-8: Function and variable names
3 PEP-8: String Quotes
4 PEP-8: Maximum Line Length
General
In
print_debug()
, you don't have to use theglobal
keyword to refer toDEBUGGING
. If the Python interpreter can't find the nameDEBUGGING
locally, it will then search for it globally. If it still can't find it, aNameError
is raised. The only two reasons to useglobal
are:When you have a local and a global variable with the same name, and you explicitly want to refer to the global variable;
In local scope, when you need to (re)assign to a global variable.
In
print_debug()
, you're shadowing the built-instr
. To avoid shadowing a variable, by convention, you should add a trailing underscore (as instr_
). If you find that ugly, you can also spell it out, or abbreviate further:string
ors
(the former is more desirable).You don't need parentheses around
if
-statements andwhile
-statements.The idiomatic way of checking if a container is empty in Python is to directly use it in an
if
-statement, in this fashion:if not container:
# Container is empty... this works because a container's
__bool__()
method returnsTrue
if it is not empty, andFalse
otherwise.When catching an exception, if you don't need access to the exception instance itself, you should leave out the
as ...
part.If you do need access to the exception instance, most people use:
except <exception type> as exc:
... or:
except <exception type> as err:
The following:
if <boolean expression>:
return True
return False... can be shortened to:
return <boolean expression>
In
randomPage()
, the following:if(regen or (not read)):
... can be simplified to become:
if regen or not read:
Avoid global variables. They are a telltale sign of a design problem in your code. Global constants are acceptable, but non-constant global variables can cause all kinds of trouble:
It's hard to track where they are being used and modified. This problem becomes very prominent when using threads;
Loading a global variable is more costly than loading a local one;
If you design an API and have lots of global variables floating about, when someone performs a wildcard import, their global namespace will be cluttered.
Generally, if you find yourself doing this:
my_list = []
for x in some_container:
    if some_condition_applies(x):
        my_list.append(x)
... a list comprehension would be a good fit:
my_list = [x for x in some_container if some_condition_applies(x)]
A list comprehension is shorter and often faster than its for-loop counterpart.
Debug messages should be sent to stderr, not stdout:
import sys
...
print("DEBUG: " + str, file=sys.stderr)
PEP-8
PEP-8 is the name of the official Python style guide. You violated it a couple of times:
Your indentation is all over the place. Sorry, but it had to be said. Take this excerpt:
print_debug("{t} already checked. Moving on".format(
t=title
)
)
... isn't that hard to read? Something like this would be much easier on the eyes:
print_debug(
    "{t} already checked. Moving on".format(t=title)
)
... or this:
print_debug("{t} already checked. Moving on".format(
    t=title)
)
Ultimately, it's up to you. PEP-8 also lists some examples of acceptable styles.[1]
Use
snake_case
for function and variable names. Only useUPPERCASE_WITH_UNDERSCORES
for constants.2Don't mix single and double quotes. I prefer double quotes, because they're less likely to cause clashes with quotation marks in flowing text.3
Limit the line length to 79 characters.4
Docstrings
Good job on adding docstrings to your functions and to the module itself. Most people (including me) often can't bring up the effort to add documentation, but you have! :) If you want to help yourself understand the code in 6 months' time, or if you want to publish this as an API, you should be a bit more thorough, for instance by adding information about:
What arguments the function takes, what type they should be, and exactly what they convey;
The return type of the value;
Any special cases the caller should be aware of.
There's even a style guide for docstrings: PEP-257.
Performance
I don't currently have the time to do a full review on performance, but I suggest you run a profiler to see where you can optimize. The problem is very likely to be I/O-bound, but the debugging can have a noticeable performance hit.
If you want to squeeze a little bit of extra speed out of requests
, you can use a requests.Session
object, which:
... [omitted] will use urllib3's connection pooling. So if you're making several requests to the same host, the underlying TCP connection will be reused, which can result in a significant performance increase ... [omitted]
References
1 PEP-8: Indentation
2 PEP-8: Function and variable names
3 PEP-8: String Quotes
4 PEP-8: Maximum Line Length
edited May 4 at 11:27
answered May 4 at 10:34
Daniel
4,1132836
4,1132836
add a comment |Â
add a comment |Â
Sign up or log in
StackExchange.ready(function ()
StackExchange.helpers.onClickDraftSave('#login-link');
);
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
StackExchange.ready(
function ()
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f193411%2fwikipedia-random-page-in-category-bot%23new-answer', 'question_page');
);
Post as a guest
Sign up or log in
StackExchange.ready(function ()
StackExchange.helpers.onClickDraftSave('#login-link');
);
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Sign up or log in
StackExchange.ready(function ()
StackExchange.helpers.onClickDraftSave('#login-link');
);
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Sign up or log in
StackExchange.ready(function ()
StackExchange.helpers.onClickDraftSave('#login-link');
);
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Sign up using Google
Sign up using Facebook
Sign up using Email and Password