Extracting data from a used car sales site
I am developing code for extracting data from used car sales sites. There are 4 sites in total. For 3 of them I use requests and BeautifulSoup, and the time taken to extract the data was satisfactory. The problem is the site handled by the class called Localiza: it takes almost 20 minutes to extract sales data for the 6000 cars. Could anyone give me tips on how to reduce the time of scraping this site?
This is the parent class of the Localiza class:
from bs4 import BeautifulSoup as bs
from selenium import webdriver as wb
import requests as req


class SiteVendaSeminovos:
    def __init__(self, url, emprise_name):
        self.__base_url = url
        self.__page_index = 1
        self.__emprise_name = emprise_name
        self.__soup = None

    def goto_next_page(self):
        self.__page_index += 1

    @property
    def base_url(self):
        return self.__base_url

    @property
    def page_index(self):
        return self.__page_index

    @property
    def soup(self):
        return self.__soup

    def set_soup(self):
        r = req.get(self.__base_url.format(self.__page_index))
        self.__soup = bs(r.text, "lxml")

    def is_finished(self):
        pass

    def get_cars(self):
        pass

    @property
    def emprise_name(self):
        return self.__emprise_name

    def get_price(self, car):
        pass

    def get_kilometragem(self, car):
        pass

    def get_model(self, car):
        pass

    def get_year(self, car):
        pass
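As a side note for the three requests-based sites: a single requests.Session keeps the TCP connection alive between page fetches instead of reconnecting on every req.get. A minimal sketch of set_soup rewritten that way (the __session attribute is an addition, not in the original code):

import requests
from bs4 import BeautifulSoup as bs

class SiteVendaSeminovos:
    def __init__(self, url, emprise_name):
        self.__base_url = url
        self.__page_index = 1
        self.__emprise_name = emprise_name
        self.__soup = None
        self.__session = requests.Session()  # one keep-alive session for all pages

    def set_soup(self):
        # Reusing the session avoids a fresh TCP/TLS handshake per page.
        r = self.__session.get(self.__base_url.format(self.__page_index))
        self.__soup = bs(r.text, "lxml")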
This is the Localiza class, which takes 20 minutes to get data from about 400 pages, more or less 6000 cars:
class Localiza(SiteVendaSeminovos):
    def __init__(self, url, emprise_name):
        super().__init__(url, emprise_name)
        # self.__web_driver = wb.Chrome("/home/rafa/Documentos/web-scrap/chromedriver")
        self.__web_driver = wb.PhantomJS("/home/rafa/Documentos/web-scrap/phantomjs")
        self.__web_driver.get(url)
        self.__id_next_page = "ctl00_ctl42_g_f221d036_75d3_4ee2_893d_0d7b40180245_ProximaPaginaSuperior"
        self.__finished = False

    def set_soup(self):
        self.__soup = bs(self.__web_driver.page_source, "lxml")

    def is_finished(self):
        return self.__finished

    def get_cars(self):
        price = self.__soup.find_all(class_="busca-right-container")
        cars = self.__soup.find_all(class_="busca-left-container")
        return [(price, list(car.stripped_strings)) for car, price in zip(cars, price)]

    def get_year(self, car):
        return car[1][1].split("/")[0]

    def get_kilometragem(self, car):
        return car[1][2]

    def get_model(self, car):
        return car[1][0]

    def get_price(self, car):
        return list(car[0].stripped_strings)[0][3:].replace(".", "")

    def goto_next_page(self):
        try:
            self.__web_driver.find_element_by_id(self.__id_next_page).click()
        except:
            self.__finished = True
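One detail worth flagging on the slow path: goto_next_page returns immediately after click(), so the next set_soup call can parse a page_source that has not finished reloading. A minimal sketch of the same method with an explicit wait, assuming the click triggers a full postback; the 10-second timeout is a guess:

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException

class Localiza(SiteVendaSeminovos):
    # ... __init__, set_soup, get_cars, etc. as above ...

    def goto_next_page(self):
        try:
            button = self.__web_driver.find_element_by_id(self.__id_next_page)
            button.click()
            # Block until the clicked button goes stale, i.e. the postback has
            # replaced the page, before the next set_soup() parses page_source.
            WebDriverWait(self.__web_driver, 10).until(EC.staleness_of(button))
        except (NoSuchElementException, TimeoutException):
            self.__finished = True

Starting PhantomJS with service_args=["--load-images=false"] may also shave page-load time, since the listing photos are never used.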
Main:
from classes import *
import sqlite3 as sqlt
import json

with open("urls_sql.json") as f:
    urls_sql = json.load(f)

with sqlt.connect("seminovos.db") as con:
    cursor = con.cursor()
    cursor.execute(urls_sql["criar_tabela_sql"])  # creates the "vendas" table in the DB
    inserir_dados_sql = urls_sql["inserir_dados_sql"]  # insert query for the DB table
    localiza = Localiza(urls_sql["url_localiza"], "Localiza")
    data = {}
    for seller_site in [localiza]:
        while True:
            seller_site.set_soup()
            if seller_site.is_finished():
                break
            for car in seller_site.get_cars():
                data["Empresa"] = seller_site.emprise_name
                data["Modelo"] = seller_site.get_model(car)
                data["Preco"] = seller_site.get_price(car)
                data["Kilometragem"] = seller_site.get_kilometragem(car)
                data["Ano"] = seller_site.get_year(car)
                cursor.execute(inserir_dados_sql.format(**data))
                con.commit()
            seller_site.goto_next_page()
JSON with the Localiza site URL and SQL queries:
"url_localiza" : "https://seminovos.localiza.com/Paginas/resultado-busca.aspx?ct=4365_2002_8466_8607_8655_4389_2604_2612_8096_1734_4720_8719_3970_7267_7300_2826_8146_5758_6667_565_8167_1307_2108_7478_8875_2372_6698_8220_4777_8234_3159_8987_6018_108_4498_9040_6057_9061_6974_6744_6749_9123_7690_9185_7719_5210_6797_957_9317_9328_9332_9352_7876_2453_9362_1968_9391_9420_5454_3873_3874_4337_1987_1081_6875&st=AL_BA_CE_DF_ES_GO_MA_MG_MS_MT_PA_PB_PE_PI_PR_RJ_RN_RS_SC_SE_SP&yr=2013_2018&pc=20000_425000&fb=W_X_T_%C3%94_A_D_C_L_1_8_F_M_U_O_R_G_B&md=000192_000097_000148_000147_000136_000119_000137_000120_000729_001061_000132_000632_000699_000122_000041_000286_000772_000719_000180_000181_000179_000715_000736_000854_000748_001076_000334_000250_000333_000330_000332_000369_000424_000545_000511_000488_000408_000418_000510_000513_000391_000211_000431_001083_000325_000326_000344_000456_000451_000455_000458_000505_000506_000502_000805_000426_000427_000623_000726_000718_000119_000132_000694_000788_000828_000005_000484_000478_000481_000476_000473_000477_001017_001016_000356_000357_000319_000317_000297_000298_000354_000123_000780_000859_000138_001006_000614_000867_000858_000675_000139_000165_000171_000174_000039_000020_000022_000019_000747_000320_000365_000311_000355_000312_000313_000322_000342_000353_000362_000314_000699_000696_000114_000779_000781_000755_000807_000806_001084_000529_001063_001098_000667_001036_001077_000133_000143_000658_000705_000707_000047",
"criar_tabela_sql" : "CREATE TABLE vendas ( ID INTEGER PRIMARY KEY AUTOINCREMENT, Empresa VARCHAR (12) NOT NULL, Modelo VARCHAR(40) NOT NULL, Preco REAL NOT NULL, Kilometragem REAL NOT NULL, Ano NUMERIC(4,0) NOT NULL );",
"inserir_dados_sql":"INSERT INTO vendas (Empresa, Modelo, Preco, Kilometragem, Ano) VALUES ('Empresa','Modelo', Preco,Kilometragem,Ano)"
Tags: python, web-scraping, sqlite, beautifulsoup, selenium
asked May 23 at 23:50 by Rafael Ribeiro
edited May 24 at 0:06 by Sam Onela
Look into profiling your code to identify the bottlenecks. If that doesn't pinpoint it, refactor your for-loop operations into functions; I'm pretty sure the for car in seller_site.get_cars(): loop is the likely suspect. – C. Harley, May 24 at 1:30
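For reference, a minimal sketch of that profiling step, assuming the scraping loop is first wrapped in a (hypothetical) main() function:

import cProfile
import pstats

cProfile.run("main()", "scrape.prof")  # profile one full scraping run
stats = pstats.Stats("scrape.prof")
stats.sort_stats("cumulative").print_stats(20)  # show the 20 biggest offenders

If most of the cumulative time lands in Selenium/PhantomJS calls rather than in get_cars, the bottleneck is page loading, not parsing.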
You commit too often. I think it should be safe to commit once the while loop is done. It may be tough on your RAM depending on the amount of data gathered, but I guess 6000 cars is not that much. – bobrobbob, May 25 at 15:59
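A minimal sketch of that suggestion, batching each page with executemany and committing once per page; the query hard-codes the column list with ? placeholders instead of building SQL via str.format:

for seller_site in [localiza]:
    while True:
        seller_site.set_soup()
        if seller_site.is_finished():
            break
        # Collect the whole page as tuples, insert in one batch, commit once.
        rows = [(seller_site.emprise_name,
                 seller_site.get_model(car),
                 seller_site.get_price(car),
                 seller_site.get_kilometragem(car),
                 seller_site.get_year(car))
                for car in seller_site.get_cars()]
        cursor.executemany(
            "INSERT INTO vendas (Empresa, Modelo, Preco, Kilometragem, Ano) "
            "VALUES (?, ?, ?, ?, ?)",
            rows)
        con.commit()  # one commit per page instead of one per car
        seller_site.goto_next_page()

The ? placeholders also sidestep the quoting problems that str.format would hit on values containing apostrophes.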