Scraping content from Yellow Pages and writing it to a CSV file

I've written a script in Python to scrape the names and phone numbers returned by a certain search on Yellow Pages, traversing multiple pages. The scraper collects all the links connected to all the names; once the links are collected, it visits each target page to parse the name and website address. Finally, it produces a CSV file with the data filled in. Although the scraper does its job without errors, I'm a little dubious as to whether I used the best way to write items to a CSV file. Any improvement to the approach would be highly appreciated.



Here is what I've written:



import requests
import csv
from urllib.parse import urljoin
from bs4 import BeautifulSoup

url = "https://www.yellowpages.com/search?search_terms=+Injury+Law+Attorneys&geo_location_terms=California"

def get_links(session,link):
    session.headers['User-Agent'] = 'Mozilla/5.0'
    res = session.get(link)
    soup = BeautifulSoup(res.text,"lxml")
    for items in soup.select(".info h2 a[data-analytics]"):
        get_info(session,urljoin(url,items.get("href")))

    nextpage = soup.select_one("a.next")
    if nextpage: #If there is no more next page link, it should break
        nexturl = urljoin(link,nextpage.get("href"))
        print(nexturl) #check to see which page I'm on
        get_links(session,nexturl) #supply the newly produced next page link to grab the next one

def get_info(session,tlink):
    session.headers['User-Agent'] = 'Mozilla/5.0'
    res = session.get(tlink)
    soup = BeautifulSoup(res.text,"lxml")
    name = soup.select_one("#main-header .sales-info h1").get_text(strip=True)
    website = soup.select_one("#main-header a.website-link").get("href") if soup.select_one("#main-header a.website-link") else ""
    print(name,website)
    writer.writerow([name,website]) #writing data in a csv file

if __name__ == '__main__':
    with open("yellowpageinfo.csv","w",newline="") as infile: #creating a csv file to write populated results
        writer = csv.writer(infile)
        writer.writerow(['Name','Website'])
        with requests.Session() as session:
            get_links(session,url)






asked Jul 13 at 21:23 by asmitu (edited Jul 13 at 21:28)
          1 Answer
Your description and your code don't seem to match: in the code I don't see anything related to phone numbers. I also don't understand what "The scraper has the ability to collect all the links connected to all the names" means. With that said, I'm not sure whether all of my critique is valid, but here I go anyway.



To answer your question about writing to CSV: I think you are doing it right. In fact, you are doing it exactly as shown in the csv module's documentation, so no issues there.



          Now I'd like to get into details.



          Minor issues




          • Format your code



Run autopep8 and isort on your code before submitting it; that makes it easier to scan through quickly.




          • Use consistent string quoting



            It doesn't matter whether you use single or double quotes. What matters is that you pick one and stick with it.




          • Get rid of useless comments



            Consider:



            writer.writerow([name, website]) # writing data in a csv file


            This comment is virtually useless, because the code right beside it says the same thing. Comments like this explain nothing and should be deleted.




          • Variable naming



• Spare a few extra characters and rename res to response

• Avoid generic names (e.g. rename items to something like search_result)

• Use underscores to separate individual words in variable names (e.g. nextpage → next_page)

            Good variable names often eliminate the need for comments. Effort spent on coming up with concise and descriptive names goes a long way towards improving readability.



          More substantial issues




          • This line is way too long.



             website = soup.select_one("#main-header a.website-link").get("href") if soup.select_one("#main-header a.website-link") else ""


Consider rewriting it as an ordinary if statement, saving the result of soup.select_one(...) into a variable to avoid the repeated function call; see the sketch below.
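
  A minimal sketch of that lookup written as a plain if statement, using the same selector as the original code:

    website_link = soup.select_one("#main-header a.website-link")
    if website_link is not None:
        website = website_link.get("href")
    else:
        website = ""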



• The use of recursion in get_links is confusing. Consider rewriting it using a while loop, as sketched below.
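
  Here is a minimal sketch of get_links rewritten with a while loop (it also applies the renames suggested above; the selectors are copied from the question):

    def get_links(session, link):
        while link:
            response = session.get(link)
            soup = BeautifulSoup(response.text, "lxml")
            for search_result in soup.select(".info h2 a[data-analytics]"):
                get_info(session, urljoin(url, search_result.get("href")))
            # Follow the pagination link until there is none left.
            next_page = soup.select_one("a.next")
            link = urljoin(link, next_page.get("href")) if next_page else None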


          Some things I might have misunderstood



• There doesn't seem to be any need to use requests.Session, and on top of that to reset the headers before every call to get. If you really need a persistent session, it'd be a good idea to leave a comment explaining why. Either way, the headers only need to be set once, as sketched below.
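
  Headers set on a requests.Session persist across requests, so the User-Agent can be configured once where the session is created:

    with requests.Session() as session:
        # Sent with every subsequent request made through this session.
        session.headers['User-Agent'] = 'Mozilla/5.0'
        get_links(session, url)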


• In your code you seem to be grabbing only the name and website for each search result. If that's the case, loading the details page for each result is redundant, as the desired information appears to be present in the search results themselves; see the sketch below.
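
  For instance, a hypothetical sketch that reads both fields straight from each result card; note that a.track-visit-website is an assumed selector for the website link on the card, so verify it against the live markup first:

    for card in soup.select(".info"):
        name_link = card.select_one("h2 a[data-analytics]")
        website_link = card.select_one("a.track-visit-website")  # assumed selector
        if name_link:
            writer.writerow([name_link.get_text(strip=True),
                             website_link.get("href") if website_link else ""])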






answered Jul 14 at 17:41 by Simon
• Hey, welcome to Code Review! Nice first answer! Regarding requests.Session: this is not a session in the sense that you get a session ID and so on. It just means the connection to the server is kept alive and reused for subsequent requests, which makes multiple requests to the same server faster.
            – Graipher
            Jul 14 at 18:18










          • Thanks for the clarification! From the documentation on requests.Session I got the impression that its primary purpose is to persist cookies across multiple requests, but now I know better.
            – Simon
            Jul 14 at 18:39











• That is one other use case for it (in addition to persisting other attributes of the request so you don't have to set them again every time).
            – Graipher
            Jul 14 at 18:40










