Crawl a site, repeat 7 times, and get the name on the final page

.everyoneloves__top-leaderboard:empty,.everyoneloves__mid-leaderboard:empty{ margin-bottom:0; }

up vote
0
down vote

favorite

Python 3.x: The code below crawls the site http://example.com/name_Potter.html and always selects the 'a' tag at the 3rd position, for example <li style="margin-top: 19px;"><a href="http://example.com/name_Yusuf.html">Yusuf</a></li>. It stores the link and its string, follows the link, and repeats; finally it returns the text found on the final page. The code works and gives the correct result. I am curious whether this logic is correct and whether it needs improvement.

from bs4 import BeautifulSoup
import urllib.request, urllib.parse, urllib.error
url = 'http://example.com/name_Potter.html'
def scrape(url):
    """Fetch ``url`` and return the target and text of its third link.

    Args:
        url: Address of the page to download.

    Returns:
        A two-item list ``[href, text]`` taken from the third ``<a>`` tag
        on the page (index 2): the value of its ``href`` attribute and the
        tag's ``.string``.

    Raises:
        IndexError: If the page contains fewer than three ``<a>`` tags.
    """
    # Use the response as a context manager so the connection is closed
    # as soon as the body has been read.
    with urllib.request.urlopen(url) as response:
        html = response.read()
    soup = BeautifulSoup(html, 'html.parser')
    # find_all is the current spelling of the legacy findAll alias.
    atag = soup.find_all("a")
    print(atag)
    if len(atag) < 3:
        # Same exception type as plain indexing, but says which page failed.
        raise IndexError(
            "expected at least 3 <a> tags at %s, found %d" % (url, len(atag))
        )
    third = atag[2]
    return [third.get('href'), third.string]

# Walk the chain of pages: start at `url`, and on each of the 7 hops record
# the link text returned by scrape() and move on to the link it points to.
# Seeding `newurl` with the start URL removes the need for a first-iteration
# special case.
namelist = []
newurl = url
for _ in range(7):
    result = scrape(newurl)
    namelist.append(result[1])
    newurl = result[0]

print(namelist)

Sample page:-

<html>
<head>
<title>contacts</title>
</head>
<body>
<h1>People that Codie knows</h1>
<div class="example" >
<center>
 Hello world
</center>
</div>
<ul>
<li style="margin-top: 5px;"><a href="http://example.com/name_Kalvyn.html">Kalvyn</a></li>
<li style="margin-top: 15px;"><a href="http://example.com/name_Zhi.html">Zhi</a></li>
<li style="margin-top: 19px;"><a href="http://example.com/name_Yusuf.html">Yusuf</a></li>
<li style="margin-top: 12px;"><a href="http://example.com/name_Andrei.html">Andrei</a></li>
<li style="margin-top: 21px;"><a href="http://example.com/name_Roba.html">Roba</a></li>
<li style="margin-top: 4px;"><a href="http://example.com/name_Aya.html">Aya</a></li>
<li style="margin-top: 12px;"><a href="http://example.com/name_Fynlay.html">Fynlay</a></li>
<li style="margin-top: 32px;"><a href="http://example.com/name_Tahlia.html">Tahlia</a></li>
<li style="margin-top: 3px;"><a href="http://example.com/name_Beatrice.html">Beatrice</a></li>
<li style="margin-top: 15px;"><a href="http://example.com/name_Rhona.html">Rhona</a></li>
<li style="margin-top: 11px;"><a href="http://example.com/name_Dean.html">Dean</a></li>
<li style="margin-top: 25px;"><a href="http://example.com/name_Cassidy.html">Cassidy</a></li>
<li style="margin-top: 17px;"><a href="http://example.com/name_Charlie.html">Charlie</a></li>
</ul>
</body>
</html>

asked Jan 27 at 16:00

satch_boogie

1948

add a comment |

up vote
0
down vote

favorite

from bs4 import BeautifulSoup
import urllib.request, urllib.parse, urllib.error
url = 'http://example.com/name_Potter.html'
def scrape(url):
    """Fetch ``url`` and return the target and text of its third link.

    Args:
        url: Address of the page to download.

    Returns:
        A two-item list ``[href, text]`` taken from the third ``<a>`` tag
        on the page (index 2): the value of its ``href`` attribute and the
        tag's ``.string``.

    Raises:
        IndexError: If the page contains fewer than three ``<a>`` tags.
    """
    # Use the response as a context manager so the connection is closed
    # as soon as the body has been read.
    with urllib.request.urlopen(url) as response:
        html = response.read()
    soup = BeautifulSoup(html, 'html.parser')
    # find_all is the current spelling of the legacy findAll alias.
    atag = soup.find_all("a")
    print(atag)
    if len(atag) < 3:
        # Same exception type as plain indexing, but says which page failed.
        raise IndexError(
            "expected at least 3 <a> tags at %s, found %d" % (url, len(atag))
        )
    third = atag[2]
    return [third.get('href'), third.string]

# Walk the chain of pages: start at `url`, and on each of the 7 hops record
# the link text returned by scrape() and move on to the link it points to.
# Seeding `newurl` with the start URL removes the need for a first-iteration
# special case.
namelist = []
newurl = url
for _ in range(7):
    result = scrape(newurl)
    namelist.append(result[1])
    newurl = result[0]

print(namelist)

Sample page:-

<html>
<head>
<title>contacts</title>
</head>
<body>
<h1>People that Codie knows</h1>
<div class="example" >
<center>
 Hello world
</center>
</div>
<ul>
<li style="margin-top: 5px;"><a href="http://example.com/name_Kalvyn.html">Kalvyn</a></li>
<li style="margin-top: 15px;"><a href="http://example.com/name_Zhi.html">Zhi</a></li>
<li style="margin-top: 19px;"><a href="http://example.com/name_Yusuf.html">Yusuf</a></li>
<li style="margin-top: 12px;"><a href="http://example.com/name_Andrei.html">Andrei</a></li>
<li style="margin-top: 21px;"><a href="http://example.com/name_Roba.html">Roba</a></li>
<li style="margin-top: 4px;"><a href="http://example.com/name_Aya.html">Aya</a></li>
<li style="margin-top: 12px;"><a href="http://example.com/name_Fynlay.html">Fynlay</a></li>
<li style="margin-top: 32px;"><a href="http://example.com/name_Tahlia.html">Tahlia</a></li>
<li style="margin-top: 3px;"><a href="http://example.com/name_Beatrice.html">Beatrice</a></li>
<li style="margin-top: 15px;"><a href="http://example.com/name_Rhona.html">Rhona</a></li>
<li style="margin-top: 11px;"><a href="http://example.com/name_Dean.html">Dean</a></li>
<li style="margin-top: 25px;"><a href="http://example.com/name_Cassidy.html">Cassidy</a></li>
<li style="margin-top: 17px;"><a href="http://example.com/name_Charlie.html">Charlie</a></li>
</ul>
</body>
</html>

asked Jan 27 at 16:00

satch_boogie

1948

add a comment |

up vote
0
down vote

favorite

from bs4 import BeautifulSoup
import urllib.request, urllib.parse, urllib.error
url = 'http://example.com/name_Potter.html'
def scrape(url):
    """Fetch ``url`` and return the target and text of its third link.

    Args:
        url: Address of the page to download.

    Returns:
        A two-item list ``[href, text]`` taken from the third ``<a>`` tag
        on the page (index 2): the value of its ``href`` attribute and the
        tag's ``.string``.

    Raises:
        IndexError: If the page contains fewer than three ``<a>`` tags.
    """
    # Use the response as a context manager so the connection is closed
    # as soon as the body has been read.
    with urllib.request.urlopen(url) as response:
        html = response.read()
    soup = BeautifulSoup(html, 'html.parser')
    # find_all is the current spelling of the legacy findAll alias.
    atag = soup.find_all("a")
    print(atag)
    if len(atag) < 3:
        # Same exception type as plain indexing, but says which page failed.
        raise IndexError(
            "expected at least 3 <a> tags at %s, found %d" % (url, len(atag))
        )
    third = atag[2]
    return [third.get('href'), third.string]

# Walk the chain of pages: start at `url`, and on each of the 7 hops record
# the link text returned by scrape() and move on to the link it points to.
# Seeding `newurl` with the start URL removes the need for a first-iteration
# special case.
namelist = []
newurl = url
for _ in range(7):
    result = scrape(newurl)
    namelist.append(result[1])
    newurl = result[0]

print(namelist)

Sample page:-

<html>
<head>
<title>contacts</title>
</head>
<body>
<h1>People that Codie knows</h1>
<div class="example" >
<center>
 Hello world
</center>
</div>
<ul>
<li style="margin-top: 5px;"><a href="http://example.com/name_Kalvyn.html">Kalvyn</a></li>
<li style="margin-top: 15px;"><a href="http://example.com/name_Zhi.html">Zhi</a></li>
<li style="margin-top: 19px;"><a href="http://example.com/name_Yusuf.html">Yusuf</a></li>
<li style="margin-top: 12px;"><a href="http://example.com/name_Andrei.html">Andrei</a></li>
<li style="margin-top: 21px;"><a href="http://example.com/name_Roba.html">Roba</a></li>
<li style="margin-top: 4px;"><a href="http://example.com/name_Aya.html">Aya</a></li>
<li style="margin-top: 12px;"><a href="http://example.com/name_Fynlay.html">Fynlay</a></li>
<li style="margin-top: 32px;"><a href="http://example.com/name_Tahlia.html">Tahlia</a></li>
<li style="margin-top: 3px;"><a href="http://example.com/name_Beatrice.html">Beatrice</a></li>
<li style="margin-top: 15px;"><a href="http://example.com/name_Rhona.html">Rhona</a></li>
<li style="margin-top: 11px;"><a href="http://example.com/name_Dean.html">Dean</a></li>
<li style="margin-top: 25px;"><a href="http://example.com/name_Cassidy.html">Cassidy</a></li>
<li style="margin-top: 17px;"><a href="http://example.com/name_Charlie.html">Charlie</a></li>
</ul>
</body>
</html>

asked Jan 27 at 16:00

satch_boogie

1948

from bs4 import BeautifulSoup
import urllib.request, urllib.parse, urllib.error
url = 'http://example.com/name_Potter.html'
def scrape(url):
    """Fetch ``url`` and return the target and text of its third link.

    Args:
        url: Address of the page to download.

    Returns:
        A two-item list ``[href, text]`` taken from the third ``<a>`` tag
        on the page (index 2): the value of its ``href`` attribute and the
        tag's ``.string``.

    Raises:
        IndexError: If the page contains fewer than three ``<a>`` tags.
    """
    # Use the response as a context manager so the connection is closed
    # as soon as the body has been read.
    with urllib.request.urlopen(url) as response:
        html = response.read()
    soup = BeautifulSoup(html, 'html.parser')
    # find_all is the current spelling of the legacy findAll alias.
    atag = soup.find_all("a")
    print(atag)
    if len(atag) < 3:
        # Same exception type as plain indexing, but says which page failed.
        raise IndexError(
            "expected at least 3 <a> tags at %s, found %d" % (url, len(atag))
        )
    third = atag[2]
    return [third.get('href'), third.string]

# Walk the chain of pages: start at `url`, and on each of the 7 hops record
# the link text returned by scrape() and move on to the link it points to.
# Seeding `newurl` with the start URL removes the need for a first-iteration
# special case.
namelist = []
newurl = url
for _ in range(7):
    result = scrape(newurl)
    namelist.append(result[1])
    newurl = result[0]

print(namelist)

Sample page:-

<html>
<head>
<title>contacts</title>
</head>
<body>
<h1>People that Codie knows</h1>
<div class="example" >
<center>
 Hello world
</center>
</div>
<ul>
<li style="margin-top: 5px;"><a href="http://example.com/name_Kalvyn.html">Kalvyn</a></li>
<li style="margin-top: 15px;"><a href="http://example.com/name_Zhi.html">Zhi</a></li>
<li style="margin-top: 19px;"><a href="http://example.com/name_Yusuf.html">Yusuf</a></li>
<li style="margin-top: 12px;"><a href="http://example.com/name_Andrei.html">Andrei</a></li>
<li style="margin-top: 21px;"><a href="http://example.com/name_Roba.html">Roba</a></li>
<li style="margin-top: 4px;"><a href="http://example.com/name_Aya.html">Aya</a></li>
<li style="margin-top: 12px;"><a href="http://example.com/name_Fynlay.html">Fynlay</a></li>
<li style="margin-top: 32px;"><a href="http://example.com/name_Tahlia.html">Tahlia</a></li>
<li style="margin-top: 3px;"><a href="http://example.com/name_Beatrice.html">Beatrice</a></li>
<li style="margin-top: 15px;"><a href="http://example.com/name_Rhona.html">Rhona</a></li>
<li style="margin-top: 11px;"><a href="http://example.com/name_Dean.html">Dean</a></li>
<li style="margin-top: 25px;"><a href="http://example.com/name_Cassidy.html">Cassidy</a></li>
<li style="margin-top: 17px;"><a href="http://example.com/name_Charlie.html">Charlie</a></li>
</ul>
</body>
</html>

asked Jan 27 at 16:00

satch_boogie

1948

asked Jan 27 at 16:00

satch_boogie

1948

asked Jan 27 at 16:00

satch_boogie

1948

asked Jan 27 at 16:00

satch_boogie

1948

add a comment |

active

oldest

votes

Your Answer

StackExchange.ifUsing("editor", function ()
return StackExchange.using("mathjaxEditing", function ()
StackExchange.MarkdownEditor.creationCallbacks.add(function (editor, postfix)
StackExchange.mathjaxEditing.prepareWmdForMathJax(editor, postfix, [["\$", "\$"]]);
);
);
, "mathjax-editing");

StackExchange.ifUsing("editor", function ()
StackExchange.using("externalEditor", function ()
StackExchange.using("snippets", function ()
StackExchange.snippets.init();
);
);
, "code-snippets");

StackExchange.ready(function()
var channelOptions =
tags: "".split(" "),
id: "196"
;
initTagRenderer("".split(" "), "".split(" "), channelOptions);

StackExchange.using("externalEditor", function()
// Have to fire editor after snippets, if snippets enabled
if (StackExchange.settings.snippets.snippetsEnabled)
StackExchange.using("snippets", function()
createEditor();
);

else
createEditor();

);

function createEditor()
StackExchange.prepareEditor(
heartbeatType: 'answer',
convertImagesToLinks: false,
noModals: false,
showLowRepImageUploadWarning: true,
reputationToPostImages: null,
bindNavPrevention: true,
postfix: "",
onDemand: true,
discardSelector: ".discard-answer"
,immediatelyShowMarkdownHelp:true
);

);

draft saved

draft discarded

StackExchange.ready(
function ()
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f186138%2fcrawls-a-site-repeat-7-times-and-gets-name-at-final-page%23new-answer', 'question_page');

);

Post as a guest

Name

active

oldest

votes

draft saved

draft discarded

draft saved

draft discarded

Post as a guest

Name

搜尋此網誌

trjhtr