Wikipedia Random Page in Category Bot

The name of the pictureThe name of the pictureThe name of the pictureClash Royale CLAN TAG#URR8PPP





.everyoneloves__top-leaderboard:empty,.everyoneloves__mid-leaderboard:empty margin-bottom:0;







up vote
5
down vote

favorite












I recently wrote a Python script to generate a random page within a Wikipedia category and its subcategories:



"""Generate a random page from a wikipedia category."""
import argparse
import random

import requests

DEBUGGING = False
max_depth = 4
current_depth = 0
header = "Garrett Credi's Random Page Bot(Contact @ gcc@ameritech.net)"
headerVal = 'Api-User-Agent': header
base_url = 'https://en.wikipedia.org/w/api.php'


def print_debug(str):
"""Print strings if in debug/verbose mode mode."""
global DEBUGGING
if(DEBUGGING):
print("DEBUG: " + str)


def generateRequestsParams(category, mode):
"""Generate the params for requests given a category and a mode."""
cmtype = ""
if(mode == "Subcat"):
cmtype = 'subcat'
elif(mode == "Subpage"):
cmtype = 'page'
params =
'format': 'json',
'action': 'query',
'list': 'categorymembers',
'cmtitle': category,
'cmlimit': 500,
'cmtype': cmtype

if(mode == "Pagecats"):
params =
'format': 'json',
'action': 'query',
'titles': category,
'prop': 'categories'

return params


def wrappedRequest(category, mode):
"""Wrap a request to deal with connection errors."""
global base_url
params = generateRequestsParams(category, mode)
global headerVal
max_times = 5
times = 0
propertyString = 'categorymembers'
while(times < max_times):
try:
r = requests.get(base_url, headers=headerVal, params=params)
if(mode != "Pagecats"):
return r.json()['query'][propertyString]
else:
for key in r.json()['query']['pages']:
return r.json()['query']['pages'][key]['categories']
except requests.exceptions.ConnectionError as e:
if(times > max_times):
print_debug("category failed too many times (times) " +
" times. Moving on".format(
category=category,
times=times
)
)
times = 0
return [category]
else:
print_debug("Retrying category due to connection " +
" error".format(
cateogry=category
)
)
times += 1


def getSubcategories(category):
"""Get subcategories of a given subcategory."""
global max_depth, DEBUGGING
current_depth = 1
singleStepSubcategories = [category]
allSubcategories =
while(current_depth <= max_depth):
print_debug("Current tree depth d".format(d=current_depth))
subcategoryTemp =
if(len(singleStepSubcategories) == 0):
break
for subcat in singleStepSubcategories:
allSubcategories.append(subcat)
subcategories = wrappedRequest(subcat, mode="Subcat")
for cat in subcategories:
title = cat['title']
print_debug("subcat has subcategory title".format(
subcat=subcat,
title=title
)
)
if(title not in allSubcategories):
allSubcategories.append(title)
subcategoryTemp.append(title)
else:
print_debug("t already checked. Moving on".format(
t=title
)
)
singleStepSubcategories = subcategoryTemp
current_depth += 1
return allSubcategories


def saveArray(category, subcats):
"""Save array to file."""
filename = "category_subcats.txt".format(category=category)
print_debug("Saving to f".format(f=filename))
with open(filename, 'w') as f:
for cat in subcats:
f.write(cat+"n")


def subcategoriesWithoutDuplicates(category):
"""Generate a list of subcategories without duplicates."""
return set(getSubcategories(category))


def retreiveSubcategoriesFromLocation(category):
"""Get subcategories from file, or generate them from scratch."""
subCats =
fileName = "category_subcats.txt".format(category=category)
try:
subCatFile = open(fileName, 'r')
print_debug("Reading from filename".format(filename=fileName))
for count, line in enumerate(subCatFile):
subCats.append(line.replace("n", ""))
subCatFile.close()
except IOError as ioError:
print_debug("fileName does not exist. Building from " +
" network".format(fileName=fileName)
)
subCats = subcategoriesWithoutDuplicates(category)
return subCats


def checkPageSimilarity(page, subcategories):
"""Check the similarity of page to a list of subcategories.
Verify if page truly is a subpage of a category.
"""
global similarityVal
pageCats = wrappedRequest(page, mode="Pagecats")
points = 0.0
# For every supercategory of page, if it is also in subcategories
# the page is more likely to be a true subpage.
for cat in pageCats:
title = cat['title']
if(title in subcategories):
points += 1.0
score = points/len(pageCats)
print_debug("Score of p is s".format(p=page, s=str(score)))
if(score >= similarityVal):
return True
return False


def randomPage(category, save, regen, check):
"""Generate a random page from a category."""
global DEBUGGING
subCats =
read = True
if(not regen):
subCats = retreiveSubcategoriesFromLocation(category)
if(regen or (not read)):
print_debug("Rebuilding category".format(category=category))
subCats = subcategoriesWithoutDuplicates(category)
if(save or regen):
saveArray(category, subCats)
randomPage = None
validRandomPage = True
cat = random.sample(subCats, 1)[0]
print_debug("Chose category cat".format(cat=cat))
pages = wrappedRequest(cat, mode="Subpage")
while(not randomPage or not validRandomPage):
try:
randomPage = random.choice(pages)
title = randomPage['title']
if(check):
print_debug("Checking " + title)
validRandomPage = checkPageSimilarity(title, subCats)
if(not validRandomPage):
pages.remove(randomPage)
except IndexError as a:
print_debug("cat has no pages. Retrying".format(cat=cat))
cat = random.sample(subCats, 1)[0]
print_debug("Chose category cat".format(cat=cat))
pages = wrappedRequest(cat, mode="Subpage")
return randomPage['title']


if(__name__ == "__main__"):
parser = argparse.ArgumentParser(description='Get a random page from a ' +
'wikipedia category')
parser.add_argument('category', help="The category you wish to get a " +
"page from."
)
parser.add_argument('--tree_depth',
nargs='?',
type=int,
default=4,
help="How far down to traverse the subcategory tree"
)
parser.add_argument('--similarity',
nargs='?',
type=float,
default=.5,
help="What percent of page categories need to be " +
"in subcategory array. Must be used with -c/--check")
parser.add_argument("-s",
"--save",
action="store_true",
help="Save subcategories to a file for quick re-runs"
)
parser.add_argument("-r",
"--regen",
action="store_true",
help="Regenerate the subcategory file"
)
parser.add_argument("-v",
"--verbose",
action="store_true",
help="Print debug lines"
)
parser.add_argument("-c",
"--check",
action="store_true",
help="After finding page check to see that it truly " +
"fits in category"
)
args = parser.parse_args()
print_debug(str(args.check))
DEBUGGING = args.verbose
max_depth = args.tree_depth
similarityVal = args.similarity
if(args.save):
print_debug("Saving!")
if(args.regen):
print_debug("Regenerating!")

print("https://en.wikipedia.org/wiki/" + randomPage("Category:" +
args.category,
save=args.save,
regen=args.regen,
check=args.check
)
)


Currently it runs fairly slowly, since it runs over networks via Python's requests module. Is there any way to make requests more efficient/faster or to get the Wikipedia subcategory tree locally (preferably in a small-ish format)?



I'd appreciate any feedback on the style/structure of my code (e.g. readability, function/variable name, function structure) and any advice on the performance of the program itself.







share|improve this question



























    up vote
    5
    down vote

    favorite












    I recently wrote a Python script to generate a random page within a Wikipedia category and its subcategories:



    """Generate a random page from a wikipedia category."""
    import argparse
    import random

    import requests

    DEBUGGING = False
    max_depth = 4
    current_depth = 0
    header = "Garrett Credi's Random Page Bot(Contact @ gcc@ameritech.net)"
    headerVal = 'Api-User-Agent': header
    base_url = 'https://en.wikipedia.org/w/api.php'


    def print_debug(str):
    """Print strings if in debug/verbose mode mode."""
    global DEBUGGING
    if(DEBUGGING):
    print("DEBUG: " + str)


    def generateRequestsParams(category, mode):
    """Generate the params for requests given a category and a mode."""
    cmtype = ""
    if(mode == "Subcat"):
    cmtype = 'subcat'
    elif(mode == "Subpage"):
    cmtype = 'page'
    params =
    'format': 'json',
    'action': 'query',
    'list': 'categorymembers',
    'cmtitle': category,
    'cmlimit': 500,
    'cmtype': cmtype

    if(mode == "Pagecats"):
    params =
    'format': 'json',
    'action': 'query',
    'titles': category,
    'prop': 'categories'

    return params


    def wrappedRequest(category, mode):
    """Wrap a request to deal with connection errors."""
    global base_url
    params = generateRequestsParams(category, mode)
    global headerVal
    max_times = 5
    times = 0
    propertyString = 'categorymembers'
    while(times < max_times):
    try:
    r = requests.get(base_url, headers=headerVal, params=params)
    if(mode != "Pagecats"):
    return r.json()['query'][propertyString]
    else:
    for key in r.json()['query']['pages']:
    return r.json()['query']['pages'][key]['categories']
    except requests.exceptions.ConnectionError as e:
    if(times > max_times):
    print_debug("category failed too many times (times) " +
    " times. Moving on".format(
    category=category,
    times=times
    )
    )
    times = 0
    return [category]
    else:
    print_debug("Retrying category due to connection " +
    " error".format(
    cateogry=category
    )
    )
    times += 1


    def getSubcategories(category):
    """Get subcategories of a given subcategory."""
    global max_depth, DEBUGGING
    current_depth = 1
    singleStepSubcategories = [category]
    allSubcategories =
    while(current_depth <= max_depth):
    print_debug("Current tree depth d".format(d=current_depth))
    subcategoryTemp =
    if(len(singleStepSubcategories) == 0):
    break
    for subcat in singleStepSubcategories:
    allSubcategories.append(subcat)
    subcategories = wrappedRequest(subcat, mode="Subcat")
    for cat in subcategories:
    title = cat['title']
    print_debug("subcat has subcategory title".format(
    subcat=subcat,
    title=title
    )
    )
    if(title not in allSubcategories):
    allSubcategories.append(title)
    subcategoryTemp.append(title)
    else:
    print_debug("t already checked. Moving on".format(
    t=title
    )
    )
    singleStepSubcategories = subcategoryTemp
    current_depth += 1
    return allSubcategories


    def saveArray(category, subcats):
    """Save array to file."""
    filename = "category_subcats.txt".format(category=category)
    print_debug("Saving to f".format(f=filename))
    with open(filename, 'w') as f:
    for cat in subcats:
    f.write(cat+"n")


    def subcategoriesWithoutDuplicates(category):
    """Generate a list of subcategories without duplicates."""
    return set(getSubcategories(category))


    def retreiveSubcategoriesFromLocation(category):
    """Get subcategories from file, or generate them from scratch."""
    subCats =
    fileName = "category_subcats.txt".format(category=category)
    try:
    subCatFile = open(fileName, 'r')
    print_debug("Reading from filename".format(filename=fileName))
    for count, line in enumerate(subCatFile):
    subCats.append(line.replace("n", ""))
    subCatFile.close()
    except IOError as ioError:
    print_debug("fileName does not exist. Building from " +
    " network".format(fileName=fileName)
    )
    subCats = subcategoriesWithoutDuplicates(category)
    return subCats


    def checkPageSimilarity(page, subcategories):
    """Check the similarity of page to a list of subcategories.
    Verify if page truly is a subpage of a category.
    """
    global similarityVal
    pageCats = wrappedRequest(page, mode="Pagecats")
    points = 0.0
    # For every supercategory of page, if it is also in subcategories
    # the page is more likely to be a true subpage.
    for cat in pageCats:
    title = cat['title']
    if(title in subcategories):
    points += 1.0
    score = points/len(pageCats)
    print_debug("Score of p is s".format(p=page, s=str(score)))
    if(score >= similarityVal):
    return True
    return False


    def randomPage(category, save, regen, check):
    """Generate a random page from a category."""
    global DEBUGGING
    subCats =
    read = True
    if(not regen):
    subCats = retreiveSubcategoriesFromLocation(category)
    if(regen or (not read)):
    print_debug("Rebuilding category".format(category=category))
    subCats = subcategoriesWithoutDuplicates(category)
    if(save or regen):
    saveArray(category, subCats)
    randomPage = None
    validRandomPage = True
    cat = random.sample(subCats, 1)[0]
    print_debug("Chose category cat".format(cat=cat))
    pages = wrappedRequest(cat, mode="Subpage")
    while(not randomPage or not validRandomPage):
    try:
    randomPage = random.choice(pages)
    title = randomPage['title']
    if(check):
    print_debug("Checking " + title)
    validRandomPage = checkPageSimilarity(title, subCats)
    if(not validRandomPage):
    pages.remove(randomPage)
    except IndexError as a:
    print_debug("cat has no pages. Retrying".format(cat=cat))
    cat = random.sample(subCats, 1)[0]
    print_debug("Chose category cat".format(cat=cat))
    pages = wrappedRequest(cat, mode="Subpage")
    return randomPage['title']


    if(__name__ == "__main__"):
    parser = argparse.ArgumentParser(description='Get a random page from a ' +
    'wikipedia category')
    parser.add_argument('category', help="The category you wish to get a " +
    "page from."
    )
    parser.add_argument('--tree_depth',
    nargs='?',
    type=int,
    default=4,
    help="How far down to traverse the subcategory tree"
    )
    parser.add_argument('--similarity',
    nargs='?',
    type=float,
    default=.5,
    help="What percent of page categories need to be " +
    "in subcategory array. Must be used with -c/--check")
    parser.add_argument("-s",
    "--save",
    action="store_true",
    help="Save subcategories to a file for quick re-runs"
    )
    parser.add_argument("-r",
    "--regen",
    action="store_true",
    help="Regenerate the subcategory file"
    )
    parser.add_argument("-v",
    "--verbose",
    action="store_true",
    help="Print debug lines"
    )
    parser.add_argument("-c",
    "--check",
    action="store_true",
    help="After finding page check to see that it truly " +
    "fits in category"
    )
    args = parser.parse_args()
    print_debug(str(args.check))
    DEBUGGING = args.verbose
    max_depth = args.tree_depth
    similarityVal = args.similarity
    if(args.save):
    print_debug("Saving!")
    if(args.regen):
    print_debug("Regenerating!")

    print("https://en.wikipedia.org/wiki/" + randomPage("Category:" +
    args.category,
    save=args.save,
    regen=args.regen,
    check=args.check
    )
    )


    Currently it runs fairly slowly, since it runs over networks via Python's requests module. Is there any way to make requests more efficient/faster or to get the Wikipedia subcategory tree locally (preferably in a small-ish format)?



    I'd appreciate any feedback on the style/structure of my code (e.g. readability, function/variable name, function structure) and any advice on the performance of the program itself.







    share|improve this question























      up vote
      5
      down vote

      favorite









      up vote
      5
      down vote

      favorite











      I recently wrote a Python script to generate a random page within a Wikipedia category and its subcategories:



      """Generate a random page from a wikipedia category."""
      import argparse
      import random

      import requests

      DEBUGGING = False
      max_depth = 4
      current_depth = 0
      header = "Garrett Credi's Random Page Bot(Contact @ gcc@ameritech.net)"
      headerVal = 'Api-User-Agent': header
      base_url = 'https://en.wikipedia.org/w/api.php'


      def print_debug(str):
      """Print strings if in debug/verbose mode mode."""
      global DEBUGGING
      if(DEBUGGING):
      print("DEBUG: " + str)


      def generateRequestsParams(category, mode):
      """Generate the params for requests given a category and a mode."""
      cmtype = ""
      if(mode == "Subcat"):
      cmtype = 'subcat'
      elif(mode == "Subpage"):
      cmtype = 'page'
      params =
      'format': 'json',
      'action': 'query',
      'list': 'categorymembers',
      'cmtitle': category,
      'cmlimit': 500,
      'cmtype': cmtype

      if(mode == "Pagecats"):
      params =
      'format': 'json',
      'action': 'query',
      'titles': category,
      'prop': 'categories'

      return params


      def wrappedRequest(category, mode):
      """Wrap a request to deal with connection errors."""
      global base_url
      params = generateRequestsParams(category, mode)
      global headerVal
      max_times = 5
      times = 0
      propertyString = 'categorymembers'
      while(times < max_times):
      try:
      r = requests.get(base_url, headers=headerVal, params=params)
      if(mode != "Pagecats"):
      return r.json()['query'][propertyString]
      else:
      for key in r.json()['query']['pages']:
      return r.json()['query']['pages'][key]['categories']
      except requests.exceptions.ConnectionError as e:
      if(times > max_times):
      print_debug("category failed too many times (times) " +
      " times. Moving on".format(
      category=category,
      times=times
      )
      )
      times = 0
      return [category]
      else:
      print_debug("Retrying category due to connection " +
      " error".format(
      cateogry=category
      )
      )
      times += 1


      def getSubcategories(category):
      """Get subcategories of a given subcategory."""
      global max_depth, DEBUGGING
      current_depth = 1
      singleStepSubcategories = [category]
      allSubcategories =
      while(current_depth <= max_depth):
      print_debug("Current tree depth d".format(d=current_depth))
      subcategoryTemp =
      if(len(singleStepSubcategories) == 0):
      break
      for subcat in singleStepSubcategories:
      allSubcategories.append(subcat)
      subcategories = wrappedRequest(subcat, mode="Subcat")
      for cat in subcategories:
      title = cat['title']
      print_debug("subcat has subcategory title".format(
      subcat=subcat,
      title=title
      )
      )
      if(title not in allSubcategories):
      allSubcategories.append(title)
      subcategoryTemp.append(title)
      else:
      print_debug("t already checked. Moving on".format(
      t=title
      )
      )
      singleStepSubcategories = subcategoryTemp
      current_depth += 1
      return allSubcategories


      def saveArray(category, subcats):
      """Save array to file."""
      filename = "category_subcats.txt".format(category=category)
      print_debug("Saving to f".format(f=filename))
      with open(filename, 'w') as f:
      for cat in subcats:
      f.write(cat+"n")


      def subcategoriesWithoutDuplicates(category):
      """Generate a list of subcategories without duplicates."""
      return set(getSubcategories(category))


      def retreiveSubcategoriesFromLocation(category):
      """Get subcategories from file, or generate them from scratch."""
      subCats =
      fileName = "category_subcats.txt".format(category=category)
      try:
      subCatFile = open(fileName, 'r')
      print_debug("Reading from filename".format(filename=fileName))
      for count, line in enumerate(subCatFile):
      subCats.append(line.replace("n", ""))
      subCatFile.close()
      except IOError as ioError:
      print_debug("fileName does not exist. Building from " +
      " network".format(fileName=fileName)
      )
      subCats = subcategoriesWithoutDuplicates(category)
      return subCats


      def checkPageSimilarity(page, subcategories):
      """Check the similarity of page to a list of subcategories.
      Verify if page truly is a subpage of a category.
      """
      global similarityVal
      pageCats = wrappedRequest(page, mode="Pagecats")
      points = 0.0
      # For every supercategory of page, if it is also in subcategories
      # the page is more likely to be a true subpage.
      for cat in pageCats:
      title = cat['title']
      if(title in subcategories):
      points += 1.0
      score = points/len(pageCats)
      print_debug("Score of p is s".format(p=page, s=str(score)))
      if(score >= similarityVal):
      return True
      return False


      def randomPage(category, save, regen, check):
      """Generate a random page from a category."""
      global DEBUGGING
      subCats =
      read = True
      if(not regen):
      subCats = retreiveSubcategoriesFromLocation(category)
      if(regen or (not read)):
      print_debug("Rebuilding category".format(category=category))
      subCats = subcategoriesWithoutDuplicates(category)
      if(save or regen):
      saveArray(category, subCats)
      randomPage = None
      validRandomPage = True
      cat = random.sample(subCats, 1)[0]
      print_debug("Chose category cat".format(cat=cat))
      pages = wrappedRequest(cat, mode="Subpage")
      while(not randomPage or not validRandomPage):
      try:
      randomPage = random.choice(pages)
      title = randomPage['title']
      if(check):
      print_debug("Checking " + title)
      validRandomPage = checkPageSimilarity(title, subCats)
      if(not validRandomPage):
      pages.remove(randomPage)
      except IndexError as a:
      print_debug("cat has no pages. Retrying".format(cat=cat))
      cat = random.sample(subCats, 1)[0]
      print_debug("Chose category cat".format(cat=cat))
      pages = wrappedRequest(cat, mode="Subpage")
      return randomPage['title']


      if(__name__ == "__main__"):
      parser = argparse.ArgumentParser(description='Get a random page from a ' +
      'wikipedia category')
      parser.add_argument('category', help="The category you wish to get a " +
      "page from."
      )
      parser.add_argument('--tree_depth',
      nargs='?',
      type=int,
      default=4,
      help="How far down to traverse the subcategory tree"
      )
      parser.add_argument('--similarity',
      nargs='?',
      type=float,
      default=.5,
      help="What percent of page categories need to be " +
      "in subcategory array. Must be used with -c/--check")
      parser.add_argument("-s",
      "--save",
      action="store_true",
      help="Save subcategories to a file for quick re-runs"
      )
      parser.add_argument("-r",
      "--regen",
      action="store_true",
      help="Regenerate the subcategory file"
      )
      parser.add_argument("-v",
      "--verbose",
      action="store_true",
      help="Print debug lines"
      )
      parser.add_argument("-c",
      "--check",
      action="store_true",
      help="After finding page check to see that it truly " +
      "fits in category"
      )
      args = parser.parse_args()
      print_debug(str(args.check))
      DEBUGGING = args.verbose
      max_depth = args.tree_depth
      similarityVal = args.similarity
      if(args.save):
      print_debug("Saving!")
      if(args.regen):
      print_debug("Regenerating!")

      print("https://en.wikipedia.org/wiki/" + randomPage("Category:" +
      args.category,
      save=args.save,
      regen=args.regen,
      check=args.check
      )
      )


      Currently it runs fairly slowly, since it runs over networks via Python's requests module. Is there any way to make requests more efficient/faster or to get the Wikipedia subcategory tree locally (preferably in a small-ish format)?



      I'd appreciate any feedback on the style/structure of my code (e.g. readability, function/variable name, function structure) and any advice on the performance of the program itself.







      share|improve this question













      I recently wrote a Python script to generate a random page within a Wikipedia category and its subcategories:



      """Generate a random page from a wikipedia category."""
      import argparse
      import random

      import requests

      DEBUGGING = False
      max_depth = 4
      current_depth = 0
      header = "Garrett Credi's Random Page Bot(Contact @ gcc@ameritech.net)"
      headerVal = 'Api-User-Agent': header
      base_url = 'https://en.wikipedia.org/w/api.php'


      def print_debug(str):
      """Print strings if in debug/verbose mode mode."""
      global DEBUGGING
      if(DEBUGGING):
      print("DEBUG: " + str)


      def generateRequestsParams(category, mode):
      """Generate the params for requests given a category and a mode."""
      cmtype = ""
      if(mode == "Subcat"):
      cmtype = 'subcat'
      elif(mode == "Subpage"):
      cmtype = 'page'
      params =
      'format': 'json',
      'action': 'query',
      'list': 'categorymembers',
      'cmtitle': category,
      'cmlimit': 500,
      'cmtype': cmtype

      if(mode == "Pagecats"):
      params =
      'format': 'json',
      'action': 'query',
      'titles': category,
      'prop': 'categories'

      return params


      def wrappedRequest(category, mode):
      """Wrap a request to deal with connection errors."""
      global base_url
      params = generateRequestsParams(category, mode)
      global headerVal
      max_times = 5
      times = 0
      propertyString = 'categorymembers'
      while(times < max_times):
      try:
      r = requests.get(base_url, headers=headerVal, params=params)
      if(mode != "Pagecats"):
      return r.json()['query'][propertyString]
      else:
      for key in r.json()['query']['pages']:
      return r.json()['query']['pages'][key]['categories']
      except requests.exceptions.ConnectionError as e:
      if(times > max_times):
      print_debug("category failed too many times (times) " +
      " times. Moving on".format(
      category=category,
      times=times
      )
      )
      times = 0
      return [category]
      else:
      print_debug("Retrying category due to connection " +
      " error".format(
      cateogry=category
      )
      )
      times += 1


      def getSubcategories(category):
      """Get subcategories of a given subcategory."""
      global max_depth, DEBUGGING
      current_depth = 1
      singleStepSubcategories = [category]
      allSubcategories =
      while(current_depth <= max_depth):
      print_debug("Current tree depth d".format(d=current_depth))
      subcategoryTemp =
      if(len(singleStepSubcategories) == 0):
      break
      for subcat in singleStepSubcategories:
      allSubcategories.append(subcat)
      subcategories = wrappedRequest(subcat, mode="Subcat")
      for cat in subcategories:
      title = cat['title']
      print_debug("subcat has subcategory title".format(
      subcat=subcat,
      title=title
      )
      )
      if(title not in allSubcategories):
      allSubcategories.append(title)
      subcategoryTemp.append(title)
      else:
      print_debug("t already checked. Moving on".format(
      t=title
      )
      )
      singleStepSubcategories = subcategoryTemp
      current_depth += 1
      return allSubcategories


      def saveArray(category, subcats):
      """Save array to file."""
      filename = "category_subcats.txt".format(category=category)
      print_debug("Saving to f".format(f=filename))
      with open(filename, 'w') as f:
      for cat in subcats:
      f.write(cat+"n")


      def subcategoriesWithoutDuplicates(category):
      """Generate a list of subcategories without duplicates."""
      return set(getSubcategories(category))


      def retreiveSubcategoriesFromLocation(category):
      """Get subcategories from file, or generate them from scratch."""
      subCats =
      fileName = "category_subcats.txt".format(category=category)
      try:
      subCatFile = open(fileName, 'r')
      print_debug("Reading from filename".format(filename=fileName))
      for count, line in enumerate(subCatFile):
      subCats.append(line.replace("n", ""))
      subCatFile.close()
      except IOError as ioError:
      print_debug("fileName does not exist. Building from " +
      " network".format(fileName=fileName)
      )
      subCats = subcategoriesWithoutDuplicates(category)
      return subCats


      def checkPageSimilarity(page, subcategories):
      """Check the similarity of page to a list of subcategories.
      Verify if page truly is a subpage of a category.
      """
      global similarityVal
      pageCats = wrappedRequest(page, mode="Pagecats")
      points = 0.0
      # For every supercategory of page, if it is also in subcategories
      # the page is more likely to be a true subpage.
      for cat in pageCats:
      title = cat['title']
      if(title in subcategories):
      points += 1.0
      score = points/len(pageCats)
      print_debug("Score of p is s".format(p=page, s=str(score)))
      if(score >= similarityVal):
      return True
      return False


      def randomPage(category, save, regen, check):
      """Generate a random page from a category."""
      global DEBUGGING
      subCats =
      read = True
      if(not regen):
      subCats = retreiveSubcategoriesFromLocation(category)
      if(regen or (not read)):
      print_debug("Rebuilding category".format(category=category))
      subCats = subcategoriesWithoutDuplicates(category)
      if(save or regen):
      saveArray(category, subCats)
      randomPage = None
      validRandomPage = True
      cat = random.sample(subCats, 1)[0]
      print_debug("Chose category cat".format(cat=cat))
      pages = wrappedRequest(cat, mode="Subpage")
      while(not randomPage or not validRandomPage):
      try:
      randomPage = random.choice(pages)
      title = randomPage['title']
      if(check):
      print_debug("Checking " + title)
      validRandomPage = checkPageSimilarity(title, subCats)
      if(not validRandomPage):
      pages.remove(randomPage)
      except IndexError as a:
      print_debug("cat has no pages. Retrying".format(cat=cat))
      cat = random.sample(subCats, 1)[0]
      print_debug("Chose category cat".format(cat=cat))
      pages = wrappedRequest(cat, mode="Subpage")
      return randomPage['title']


      if(__name__ == "__main__"):
      parser = argparse.ArgumentParser(description='Get a random page from a ' +
      'wikipedia category')
      parser.add_argument('category', help="The category you wish to get a " +
      "page from."
      )
      parser.add_argument('--tree_depth',
      nargs='?',
      type=int,
      default=4,
      help="How far down to traverse the subcategory tree"
      )
      parser.add_argument('--similarity',
      nargs='?',
      type=float,
      default=.5,
      help="What percent of page categories need to be " +
      "in subcategory array. Must be used with -c/--check")
      parser.add_argument("-s",
      "--save",
      action="store_true",
      help="Save subcategories to a file for quick re-runs"
      )
      parser.add_argument("-r",
      "--regen",
      action="store_true",
      help="Regenerate the subcategory file"
      )
      parser.add_argument("-v",
      "--verbose",
      action="store_true",
      help="Print debug lines"
      )
      parser.add_argument("-c",
      "--check",
      action="store_true",
      help="After finding page check to see that it truly " +
      "fits in category"
      )
      args = parser.parse_args()
      print_debug(str(args.check))
      DEBUGGING = args.verbose
      max_depth = args.tree_depth
      similarityVal = args.similarity
      if(args.save):
      print_debug("Saving!")
      if(args.regen):
      print_debug("Regenerating!")

      print("https://en.wikipedia.org/wiki/" + randomPage("Category:" +
      args.category,
      save=args.save,
      regen=args.regen,
      check=args.check
      )
      )


      Currently it runs fairly slowly, since it runs over networks via Python's requests module. Is there any way to make requests more efficient/faster or to get the Wikipedia subcategory tree locally (preferably in a small-ish format)?



      I'd appreciate any feedback on the style/structure of my code (e.g. readability, function/variable name, function structure) and any advice on the performance of the program itself.









      share|improve this question












      share|improve this question




      share|improve this question








      edited May 4 at 12:38









      Daniel

      4,1132836




      4,1132836









      asked May 2 at 2:20









      Garrett Credi

      363




      363




















          1 Answer
          1






          active

          oldest

          votes

















          up vote
          2
          down vote













          General




          1. In print_debug(), you don't have to use the global keyword to refer to DEBUGGING. If the Python interpreter can't find the name DEBUGGING locally, it will then search for it globally. If it still can't find it, a NameError is raised. The only two reasons to use global are:



            1. When you have a local and a global variable with the same name, and you explicitly want to refer to the global variable;


            2. In local scope, when you need to (re)assign to a global variable.



          2. In print_debug(), you're shadowing the built-in str. To avoid shadowing a variable, by convention, you should add a trailing underscore (as in str_). If you find that ugly, you can also spell it out, or abbreviate further: string or s (the former is more desirable).


          3. You don't need parentheses around if-statements and while-statements.



          4. The idiomatic way of checking if a container is empty in Python is to directly use it in an if-statement, in this fashion:



            if not container:
            # Container is empty


            ... this works because a container's __bool__() method returns True if it is not empty, and False otherwise.



          5. When catching an exception, if you don't need access to the exception instance itself, you should leave out the as ... part.



          6. If you do need access to the exception instance, most people use:



            except <exception type> as exc:


            ... or:



            except <exception type> as err:



          7. The following:



            if <boolean expression>:
            return True
            return False


            ... can be shortened to:



            return <boolean expression>



          8. In randomPage(), the following:



            if(regen or (not read)):


            ... can be simplified to become:



            if regen or not read:



          9. Avoid global variables. They are a telltale sign of a design problem in your code. Global constants are acceptable, but non-constant global variables can cause all kinds of trouble:



            1. It's hard to track where they are being used and modified. This problem becomes very prominent when using threads;


            2. Loading a global variable is more costly than loading a local one;


            3. If you design an API and have lots of global variables floating about, when someone performs a wildcard import, their global namespace will be cluttered.




          10. Generally, if you find yourself doing this:



            my_list = 
            for x in some_container:
            if some_condition_applies(x):
            my_list.append(x)


            ... a list comprehension would be a good fit:



            my_list = [x for x in some_container if some_condition_applies(x)]


            A list comprehensions is shorter and often faster than its for-loop counterpart.




          11. Debug messages should be sent to stderr, not stdout:



            import sys

            ...

            print("DEBUG: " + str, file=sys.stderr)


          PEP-8



          PEP-8 is the name of the official Python style guide. You violated it a couple of times:




          1. Your indentation is all over the place. Sorry, but it had to be said. Take this excerpt:



             print_debug("t already checked. Moving on".format(
            t=title
            )
            )


            ... isn't that hard to read? Something like this would be much easier on the eyes:



            print_debug(
            "t already checked. Moving on".format(t=title)
            )


            ... or this:



            print_debug("t already checked. Moving on".format(
            t=title)
            )


            Ultimately, it's up to you. PEP-8 also lists some examples of acceptable styles.1



          2. Use snake_case for function and variable names. Only use UPPERCASE_WITH_UNDERSCORES for constants.2


          3. Don't mix single and double quotes. I prefer double quotes, because they're less likely to cause clashes with quotation marks in flowing text.3


          4. Limit the line length to 79 characters.4


          Docstrings




          1. Good job on adding docstrings to your functions and to the module itself. Most people (including me) often can't bring up the effort to add documentation, but you have! :) If you want to help yourself understand the code in 6 months' time, or if you want to publish this as an API, you should be a bit more thorough, for instance by adding information about:



            • What arguments the function takes, what type they should be, and exactly what they convey;


            • The return type of the value;


            • Any special cases the caller should be aware of.



            There's even a style guide for docstrings: PEP-257.



          Performance



          I don't currently have the time to do a full review on performance, but I suggest you run a profiler to see where you can optimize. The problem is very likely to be I/O-bound, but the debugging can have a noticable performance hit.



          If you want to squeeze a little bit of extra speed out of requests, you can use a requests.Session object, which:




          ... [omitted] will use urllib3's connection pooling. So if you're making several requests to the same host, the underlying TCP connection will be reused, which can result in a significant performance increase ... [omitted]




          References



          1 PEP-8: Indentation



          2 PEP-8: Function and variable names



          3 PEP-8: String Quotes



          4 PEP-8: Maximum Line Length






          share|improve this answer























            Your Answer




            StackExchange.ifUsing("editor", function ()
            return StackExchange.using("mathjaxEditing", function ()
            StackExchange.MarkdownEditor.creationCallbacks.add(function (editor, postfix)
            StackExchange.mathjaxEditing.prepareWmdForMathJax(editor, postfix, [["\$", "\$"]]);
            );
            );
            , "mathjax-editing");

            StackExchange.ifUsing("editor", function ()
            StackExchange.using("externalEditor", function ()
            StackExchange.using("snippets", function ()
            StackExchange.snippets.init();
            );
            );
            , "code-snippets");

            StackExchange.ready(function()
            var channelOptions =
            tags: "".split(" "),
            id: "196"
            ;
            initTagRenderer("".split(" "), "".split(" "), channelOptions);

            StackExchange.using("externalEditor", function()
            // Have to fire editor after snippets, if snippets enabled
            if (StackExchange.settings.snippets.snippetsEnabled)
            StackExchange.using("snippets", function()
            createEditor();
            );

            else
            createEditor();

            );

            function createEditor()
            StackExchange.prepareEditor(
            heartbeatType: 'answer',
            convertImagesToLinks: false,
            noModals: false,
            showLowRepImageUploadWarning: true,
            reputationToPostImages: null,
            bindNavPrevention: true,
            postfix: "",
            onDemand: true,
            discardSelector: ".discard-answer"
            ,immediatelyShowMarkdownHelp:true
            );



            );








             

            draft saved


            draft discarded


















            StackExchange.ready(
            function ()
            StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f193411%2fwikipedia-random-page-in-category-bot%23new-answer', 'question_page');

            );

            Post as a guest






























            1 Answer
            1






            active

            oldest

            votes








            1 Answer
            1






            active

            oldest

            votes









            active

            oldest

            votes






            active

            oldest

            votes








            up vote
            2
            down vote













            General




            1. In print_debug(), you don't have to use the global keyword to refer to DEBUGGING. If the Python interpreter can't find the name DEBUGGING locally, it will then search for it globally. If it still can't find it, a NameError is raised. The only two reasons to use global are:



              1. When you have a local and a global variable with the same name, and you explicitly want to refer to the global variable;


              2. In local scope, when you need to (re)assign to a global variable.



            2. In print_debug(), you're shadowing the built-in str. To avoid shadowing a variable, by convention, you should add a trailing underscore (as in str_). If you find that ugly, you can also spell it out, or abbreviate further: string or s (the former is more desirable).


            3. You don't need parentheses around if-statements and while-statements.



            4. The idiomatic way of checking if a container is empty in Python is to directly use it in an if-statement, in this fashion:



              if not container:
              # Container is empty


              ... this works because a container's __bool__() method returns True if it is not empty, and False otherwise.



            5. When catching an exception, if you don't need access to the exception instance itself, you should leave out the as ... part.



            6. If you do need access to the exception instance, most people use:



              except <exception type> as exc:


              ... or:



              except <exception type> as err:



            7. The following:



              if <boolean expression>:
              return True
              return False


              ... can be shortened to:



              return <boolean expression>



            8. In randomPage(), the following:



              if(regen or (not read)):


              ... can be simplified to become:



              if regen or not read:



            9. Avoid global variables. They are a telltale sign of a design problem in your code. Global constants are acceptable, but non-constant global variables can cause all kinds of trouble:



              1. It's hard to track where they are being used and modified. This problem becomes very prominent when using threads;


              2. Loading a global variable is more costly than loading a local one;


              3. If you design an API and have lots of global variables floating about, when someone performs a wildcard import, their global namespace will be cluttered.




            10. Generally, if you find yourself doing this:



              my_list = 
              for x in some_container:
              if some_condition_applies(x):
              my_list.append(x)


              ... a list comprehension would be a good fit:



              my_list = [x for x in some_container if some_condition_applies(x)]


              A list comprehensions is shorter and often faster than its for-loop counterpart.




            11. Debug messages should be sent to stderr, not stdout:



              import sys

              ...

              print("DEBUG: " + str, file=sys.stderr)


            PEP-8



            PEP-8 is the name of the official Python style guide. You violated it a couple of times:




            1. Your indentation is all over the place. Sorry, but it had to be said. Take this excerpt:



               print_debug("t already checked. Moving on".format(
              t=title
              )
              )


              ... isn't that hard to read? Something like this would be much easier on the eyes:



              print_debug(
              "t already checked. Moving on".format(t=title)
              )


              ... or this:



              print_debug("t already checked. Moving on".format(
              t=title)
              )


              Ultimately, it's up to you. PEP-8 also lists some examples of acceptable styles.1



            2. Use snake_case for function and variable names. Only use UPPERCASE_WITH_UNDERSCORES for constants.2


            3. Don't mix single and double quotes. I prefer double quotes, because they're less likely to cause clashes with quotation marks in flowing text.3


            4. Limit the line length to 79 characters.4


            Docstrings




            1. Good job on adding docstrings to your functions and to the module itself. Most people (including me) often can't bring up the effort to add documentation, but you have! :) If you want to help yourself understand the code in 6 months' time, or if you want to publish this as an API, you should be a bit more thorough, for instance by adding information about:



              • What arguments the function takes, what type they should be, and exactly what they convey;


              • The return type of the value;


              • Any special cases the caller should be aware of.



              There's even a style guide for docstrings: PEP-257.



            Performance



            I don't currently have the time to do a full review on performance, but I suggest you run a profiler to see where you can optimize. The problem is very likely to be I/O-bound, but the debugging can have a noticable performance hit.



            If you want to squeeze a little bit of extra speed out of requests, you can use a requests.Session object, which:




            ... [omitted] will use urllib3's connection pooling. So if you're making several requests to the same host, the underlying TCP connection will be reused, which can result in a significant performance increase ... [omitted]




            References



            1 PEP-8: Indentation



            2 PEP-8: Function and variable names



            3 PEP-8: String Quotes



            4 PEP-8: Maximum Line Length






            share|improve this answer



























              up vote
              2
              down vote













              General




              1. In print_debug(), you don't have to use the global keyword to refer to DEBUGGING. If the Python interpreter can't find the name DEBUGGING locally, it will then search for it globally. If it still can't find it, a NameError is raised. The only two reasons to use global are:



                1. When you have a local and a global variable with the same name, and you explicitly want to refer to the global variable;


                2. In local scope, when you need to (re)assign to a global variable.



              2. In print_debug(), you're shadowing the built-in str. To avoid shadowing a variable, by convention, you should add a trailing underscore (as in str_). If you find that ugly, you can also spell it out, or abbreviate further: string or s (the former is more desirable).


              3. You don't need parentheses around if-statements and while-statements.



              4. The idiomatic way of checking if a container is empty in Python is to directly use it in an if-statement, in this fashion:



                if not container:
                # Container is empty


                ... this works because a container's __bool__() method returns True if it is not empty, and False otherwise.



              5. When catching an exception, if you don't need access to the exception instance itself, you should leave out the as ... part.



              6. If you do need access to the exception instance, most people use:



                except <exception type> as exc:


                ... or:



                except <exception type> as err:



              7. The following:



                if <boolean expression>:
                return True
                return False


                ... can be shortened to:



                return <boolean expression>



              8. In randomPage(), the following:



                if(regen or (not read)):


                ... can be simplified to become:



                if regen or not read:



              9. Avoid global variables. They are a telltale sign of a design problem in your code. Global constants are acceptable, but non-constant global variables can cause all kinds of trouble:



                1. It's hard to track where they are being used and modified. This problem becomes very prominent when using threads;


                2. Loading a global variable is more costly than loading a local one;


                3. If you design an API and have lots of global variables floating about, when someone performs a wildcard import, their global namespace will be cluttered.




              10. Generally, if you find yourself doing this:



                my_list = 
                for x in some_container:
                if some_condition_applies(x):
                my_list.append(x)


                ... a list comprehension would be a good fit:



                my_list = [x for x in some_container if some_condition_applies(x)]


                A list comprehensions is shorter and often faster than its for-loop counterpart.




              11. Debug messages should be sent to stderr, not stdout:



                import sys

                ...

                print("DEBUG: " + str, file=sys.stderr)


              PEP-8



              PEP-8 is the name of the official Python style guide. You violated it a couple of times:




              1. Your indentation is all over the place. Sorry, but it had to be said. Take this excerpt:



                 print_debug("t already checked. Moving on".format(
                t=title
                )
                )


                ... isn't that hard to read? Something like this would be much easier on the eyes:



                print_debug(
                "t already checked. Moving on".format(t=title)
                )


                ... or this:



                print_debug("t already checked. Moving on".format(
                t=title)
                )


                Ultimately, it's up to you. PEP-8 also lists some examples of acceptable styles.1



              2. Use snake_case for function and variable names. Only use UPPERCASE_WITH_UNDERSCORES for constants.2


              3. Don't mix single and double quotes. I prefer double quotes, because they're less likely to cause clashes with quotation marks in flowing text.3


              4. Limit the line length to 79 characters.4


              Docstrings




              1. Good job on adding docstrings to your functions and to the module itself. Most people (including me) often can't bring up the effort to add documentation, but you have! :) If you want to help yourself understand the code in 6 months' time, or if you want to publish this as an API, you should be a bit more thorough, for instance by adding information about:



                • What arguments the function takes, what type they should be, and exactly what they convey;


                • The return type of the value;


                • Any special cases the caller should be aware of.



                There's even a style guide for docstrings: PEP-257.



              Performance



              I don't currently have the time to do a full review on performance, but I suggest you run a profiler to see where you can optimize. The problem is very likely to be I/O-bound, but the debugging can have a noticable performance hit.



              If you want to squeeze a little bit of extra speed out of requests, you can use a requests.Session object, which:




              ... [omitted] will use urllib3's connection pooling. So if you're making several requests to the same host, the underlying TCP connection will be reused, which can result in a significant performance increase ... [omitted]




              References



              1 PEP-8: Indentation



              2 PEP-8: Function and variable names



              3 PEP-8: String Quotes



              4 PEP-8: Maximum Line Length






              share|improve this answer

























                up vote
                2
                down vote










                up vote
                2
                down vote









                General




                1. In print_debug(), you don't have to use the global keyword to refer to DEBUGGING. If the Python interpreter can't find the name DEBUGGING locally, it will then search for it globally. If it still can't find it, a NameError is raised. The only two reasons to use global are:



                  1. When you have a local and a global variable with the same name, and you explicitly want to refer to the global variable;


                  2. In local scope, when you need to (re)assign to a global variable.



                2. In print_debug(), you're shadowing the built-in str. To avoid shadowing a variable, by convention, you should add a trailing underscore (as in str_). If you find that ugly, you can also spell it out, or abbreviate further: string or s (the former is more desirable).


                3. You don't need parentheses around if-statements and while-statements.



                4. The idiomatic way of checking if a container is empty in Python is to directly use it in an if-statement, in this fashion:



                  if not container:
                  # Container is empty


                  ... this works because a container's __bool__() method returns True if it is not empty, and False otherwise.



                5. When catching an exception, if you don't need access to the exception instance itself, you should leave out the as ... part.



                6. If you do need access to the exception instance, most people use:



                  except <exception type> as exc:


                  ... or:



                  except <exception type> as err:



                7. The following:



                  if <boolean expression>:
                  return True
                  return False


                  ... can be shortened to:



                  return <boolean expression>



                8. In randomPage(), the following:



                  if(regen or (not read)):


                  ... can be simplified to become:



                  if regen or not read:



                9. Avoid global variables. They are a telltale sign of a design problem in your code. Global constants are acceptable, but non-constant global variables can cause all kinds of trouble:



                  1. It's hard to track where they are being used and modified. This problem becomes very prominent when using threads;


                  2. Loading a global variable is more costly than loading a local one;


                  3. If you design an API and have lots of global variables floating about, when someone performs a wildcard import, their global namespace will be cluttered.




                10. Generally, if you find yourself doing this:



                  my_list = 
                  for x in some_container:
                  if some_condition_applies(x):
                  my_list.append(x)


                  ... a list comprehension would be a good fit:



                  my_list = [x for x in some_container if some_condition_applies(x)]


                  A list comprehensions is shorter and often faster than its for-loop counterpart.




                11. Debug messages should be sent to stderr, not stdout:



                  import sys

                  ...

                  print("DEBUG: " + str, file=sys.stderr)


                PEP-8



                PEP-8 is the name of the official Python style guide. You violated it a couple of times:




                1. Your indentation is all over the place. Sorry, but it had to be said. Take this excerpt:



                   print_debug("t already checked. Moving on".format(
                  t=title
                  )
                  )


                  ... isn't that hard to read? Something like this would be much easier on the eyes:



                  print_debug(
                  "t already checked. Moving on".format(t=title)
                  )


                  ... or this:



                  print_debug("t already checked. Moving on".format(
                  t=title)
                  )


                  Ultimately, it's up to you. PEP-8 also lists some examples of acceptable styles.1



                2. Use snake_case for function and variable names. Only use UPPERCASE_WITH_UNDERSCORES for constants.2


                3. Don't mix single and double quotes. I prefer double quotes, because they're less likely to cause clashes with quotation marks in flowing text.3


                4. Limit the line length to 79 characters.4


                Docstrings




                1. Good job on adding docstrings to your functions and to the module itself. Most people (including me) often can't bring up the effort to add documentation, but you have! :) If you want to help yourself understand the code in 6 months' time, or if you want to publish this as an API, you should be a bit more thorough, for instance by adding information about:



                  • What arguments the function takes, what type they should be, and exactly what they convey;


                  • The return type of the value;


                  • Any special cases the caller should be aware of.



                  There's even a style guide for docstrings: PEP-257.



                Performance



                I don't currently have the time to do a full review on performance, but I suggest you run a profiler to see where you can optimize. The problem is very likely to be I/O-bound, but the debugging can have a noticable performance hit.



                If you want to squeeze a little bit of extra speed out of requests, you can use a requests.Session object, which:




                ... [omitted] will use urllib3's connection pooling. So if you're making several requests to the same host, the underlying TCP connection will be reused, which can result in a significant performance increase ... [omitted]




                References



                1 PEP-8: Indentation



                2 PEP-8: Function and variable names



                3 PEP-8: String Quotes



                4 PEP-8: Maximum Line Length






                share|improve this answer















                General




                1. In print_debug(), you don't have to use the global keyword to refer to DEBUGGING. If the Python interpreter can't find the name DEBUGGING locally, it will then search for it globally. If it still can't find it, a NameError is raised. The only two reasons to use global are:



                  1. When you have a local and a global variable with the same name, and you explicitly want to refer to the global variable;


                  2. In local scope, when you need to (re)assign to a global variable.



                2. In print_debug(), you're shadowing the built-in str. To avoid shadowing a variable, by convention, you should add a trailing underscore (as in str_). If you find that ugly, you can also spell it out, or abbreviate further: string or s (the former is more desirable).


                3. You don't need parentheses around if-statements and while-statements.



                4. The idiomatic way of checking if a container is empty in Python is to directly use it in an if-statement, in this fashion:



                  if not container:
                  # Container is empty


                  ... this works because a container's __bool__() method returns True if it is not empty, and False otherwise.



                5. When catching an exception, if you don't need access to the exception instance itself, you should leave out the as ... part.



                6. If you do need access to the exception instance, most people use:



                  except <exception type> as exc:


                  ... or:



                  except <exception type> as err:



                7. The following:



                  if <boolean expression>:
                  return True
                  return False


                  ... can be shortened to:



                  return <boolean expression>



                8. In randomPage(), the following:



                  if(regen or (not read)):


                  ... can be simplified to become:



                  if regen or not read:



                9. Avoid global variables. They are a telltale sign of a design problem in your code. Global constants are acceptable, but non-constant global variables can cause all kinds of trouble:



                  1. It's hard to track where they are being used and modified. This problem becomes very prominent when using threads;


                  2. Loading a global variable is more costly than loading a local one;


                  3. If you design an API and have lots of global variables floating about, when someone performs a wildcard import, their global namespace will be cluttered.




                10. Generally, if you find yourself doing this:



                  my_list = 
                  for x in some_container:
                  if some_condition_applies(x):
                  my_list.append(x)


                  ... a list comprehension would be a good fit:



                  my_list = [x for x in some_container if some_condition_applies(x)]


                  A list comprehensions is shorter and often faster than its for-loop counterpart.




                11. Debug messages should be sent to stderr, not stdout:



                  import sys

                  ...

                  print("DEBUG: " + str, file=sys.stderr)


                PEP-8



                PEP-8 is the name of the official Python style guide. You violated it a couple of times:




                1. Your indentation is all over the place. Sorry, but it had to be said. Take this excerpt:



                   print_debug("t already checked. Moving on".format(
                  t=title
                  )
                  )


                  ... isn't that hard to read? Something like this would be much easier on the eyes:



                  print_debug(
                  "t already checked. Moving on".format(t=title)
                  )


                  ... or this:



                  print_debug("t already checked. Moving on".format(
                  t=title)
                  )


                  Ultimately, it's up to you. PEP-8 also lists some examples of acceptable styles.1



                2. Use snake_case for function and variable names. Only use UPPERCASE_WITH_UNDERSCORES for constants.2


                3. Don't mix single and double quotes. I prefer double quotes, because they're less likely to cause clashes with quotation marks in flowing text.3


                4. Limit the line length to 79 characters.4


                Docstrings




                1. Good job on adding docstrings to your functions and to the module itself. Most people (including me) often can't bring up the effort to add documentation, but you have! :) If you want to help yourself understand the code in 6 months' time, or if you want to publish this as an API, you should be a bit more thorough, for instance by adding information about:



                  • What arguments the function takes, what type they should be, and exactly what they convey;


                  • The return type of the value;


                  • Any special cases the caller should be aware of.



                  There's even a style guide for docstrings: PEP-257.



                Performance



                I don't currently have the time to do a full review on performance, but I suggest you run a profiler to see where you can optimize. The problem is very likely to be I/O-bound, but the debugging can have a noticable performance hit.



                If you want to squeeze a little bit of extra speed out of requests, you can use a requests.Session object, which:




                ... [omitted] will use urllib3's connection pooling. So if you're making several requests to the same host, the underlying TCP connection will be reused, which can result in a significant performance increase ... [omitted]




                References



                1 PEP-8: Indentation



                2 PEP-8: Function and variable names



                3 PEP-8: String Quotes



                4 PEP-8: Maximum Line Length







                share|improve this answer















                share|improve this answer



                share|improve this answer








                edited May 4 at 11:27


























                answered May 4 at 10:34









                Daniel

                4,1132836




                4,1132836






















                     

                    draft saved


                    draft discarded


























                     


                    draft saved


                    draft discarded














                    StackExchange.ready(
                    function ()
                    StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f193411%2fwikipedia-random-page-in-category-bot%23new-answer', 'question_page');

                    );

                    Post as a guest













































































                    Popular posts from this blog

                    Greedy Best First Search implementation in Rust

                    Function to Return a JSON Like Objects Using VBA Collections and Arrays

                    C++11 CLH Lock Implementation