Grabbing information traversing multiple pages

I've written a script in Python, in combination with Selenium, that parses different pieces of information from a web page and stores the collected data in a CSV file. The data come through correctly. The email portion looks weird because I wrapped two conditions in a single line to grab all the leads (in case one is missing, the other comes into play). Moreover, the way I've used the CSV writer doesn't look that good. I hope there is a better way to implement the logic I pursued below.



This is the script:



import csv
from selenium import webdriver
from bs4 import BeautifulSoup

urls = "https://www.krak.dk/byggefirmaer/p:{}/s%C3%B8g.cs?xcoord=90.3636094&ycoord=23.8150673"

def get_info(driver, urls):
    with open("output.csv", "w", newline="") as infile:
        writer = csv.writer(infile)
        writer.writerow(["Name", "Link", "Email"])

        for url in [urls.format(page) for page in range(1, 3)]:
            driver.get(url)
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            for links in soup.select("article.vcard"):
                name = links.select_one(".hit-company-name-ellipsis a").get_text(strip=True)
                link = links.select_one(".hit-homepage-link").get("href") if links.select_one(".hit-homepage-link") else ""
                email = links.select_one(".hit-footer-wrapper span[data-mail]").get("data-mail") or links.select_one(".hit-footer-wrapper span[data-mail-e-contact-mail]").get("data-mail-e-contact-mail")
                print(f'{name}\n{link}\n{email}\n')
                writer.writerow([name, link, email])

if __name__ == '__main__':
    driver = webdriver.Chrome()
    try:
        get_info(driver, urls)
    finally:
        driver.quit()
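On the CSV side, one possible tidy-up (a sketch of my own, not the poster's code; the sample row is invented) is to collect each record as a dict and write everything with `csv.DictWriter`, so the header and field order are declared in one place and the scraping loop stays free of file handling:

```python
import csv
import io

# Hypothetical example rows; in the real script these would be built
# inside the scraping loop instead of being hard-coded.
rows = [
    {"Name": "Acme Byg", "Link": "http://example.com", "Email": "info@example.com"},
    {"Name": "Nordisk Byg", "Link": "", "Email": ""},
]

# io.StringIO stands in for the output file so the sketch is self-contained.
buf = io.StringIO()
writer = csv.DictWriter(buf, fieldnames=["Name", "Link", "Email"])
writer.writeheader()      # header comes from `fieldnames`, declared once
writer.writerows(rows)    # missing keys would raise, catching field typos early
```

With a real file you would replace `buf` with `open("output.csv", "w", newline="")`.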


Currently the above script traverses two pages and scrapes three fields: name, link, and email.
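As one possible cleanup of the email one-liner (a sketch, not part of the original post; `first_attr` is a hypothetical helper name), the two chained lookups can be generalized to a list of (CSS selector, attribute) pairs tried in order. Unlike the chained `or`, this does not raise an `AttributeError` when a selector matches nothing:

```python
# Hypothetical helper: try each (CSS selector, attribute) pair against `tag`
# and return the first attribute value found, or `default` if none match.
def first_attr(tag, candidates, default=""):
    """Return the attribute of the first matching selector, else `default`."""
    for selector, attr in candidates:
        node = tag.select_one(selector)
        if node is not None:
            value = node.get(attr)
            if value is not None:
                return value
    return default
```

With this helper, the email line becomes `first_attr(links, [(".hit-footer-wrapper span[data-mail]", "data-mail"), (".hit-footer-wrapper span[data-mail-e-contact-mail]", "data-mail-e-contact-mail")])`, and the `link` lookup can reuse it with a single pair instead of calling `select_one` twice.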







  • There are small changes I would make (not related to the messy usage of links), but in my experience, web scraping can get very messy.
    – Dair
    Jun 25 at 20:42
















asked Jun 25 at 10:33

Topto (2158)










