Scraping StackOverflow's questions with Node.js
Clash Royale CLAN TAG#URR8PPP
.everyoneloves__top-leaderboard:empty,.everyoneloves__mid-leaderboard:empty margin-bottom:0;
up vote
3
down vote
favorite
I'm new to node and web scraping in general but with some research I've written this working code.
My goal is to get all questions from X Stack Overflow's pages sorted by most rated and save that data locally to use on another project to plot charts and other analytical stuff.
How can I improve it in both readability and performance?
Any convention or language/technology nuance that I got wrong?
const CHEERIO = require('cheerio');
const REQUEST = require('request');
const JSONFRAME = require('jsonframe-cheerio');
const CORES = require('os').cpus().length;
const CLUSTER = require('cluster');
const FS = require('fs');
const LAST_PAGE_TO_SCRAPE = 10;
const QUANTITY_OF_PAGES_PER_WORKER = 1;
const BASE_URL = 'https://stackoverflow.com/questions?page=';
const URL_ORDERING = '&sort=votes';
if (CLUSTER.isMaster)
let nextPage = 1;
for (let i = 0; i < CORES; i++)
CLUSTER.fork( startingPoint: nextPage );
nextPage += 1;
CLUSTER.on('online', (worker) =>
console.log(`Worker $worker.process.pid is now working.`);
);
CLUSTER.on('exit', (worker, code, signal) =>
if (code !== 0) //restart
console.log(`Worker $worker.process.pid died. Restarting.`);
CLUSTER.fork( startingPoint: worker.process.env.startingPoint );
else //scrape next X pages
console.log(`Worker $worker.process.pid finished it's work succesfully.`);
if (nextPage <= LAST_PAGE_TO_SCRAPE)
CLUSTER.fork( startingPoint: nextPage );
nextPage += QUANTITY_OF_PAGES_PER_WORKER;
);
else
let workerStartingPoint = parseInt(process.env.startingPoint);
for (let i = workerStartingPoint; i < workerStartingPoint + QUANTITY_OF_PAGES_PER_WORKER; i++)
REQUEST(BASE_URL + i + URL_ORDERING, function (error, response, html)
if (error)
process.exit(workerStartingPoint);//error code is where the worker should start again
let $ = CHEERIO.load(html);
JSONFRAME($);
let frame =
questions:
_s: "#questions .question-summary",
_d: [
"votes": ".statscontainer .stats .vote .votes .vote-count-post strong",
"answers": ".statscontainer .stats .status strong",
"views": ".statscontainer .views",
"title": ".summary h3 a",
"tags": [".summary .tags a"],
"url": ".question-hyperlink @ href",
"user":
"name": ".summary .started .user-info .user-details a",
"profile-link": ".summary .started .user-info .user-details a @ href",
"reputation": ".summary .started .user-info .user-details .-flair .reputation-score"
,
"date asked": ".summary .started .user-info .user-action-time .relativetime"
]
let questions = $('body').scrape(frame, string: true );
FS.writeFile('page-' + i + '.json', questions, function (error)
if (error)
process.exit(workerStartingPoint);
process.exit(0);
)
);
If you're going to run this, don't set the LAST_PAGE_TO_SCRAPE
too large because your IP is going to get temporarily blocked by SO.
node.js web-scraping ecmascript-6 stackexchange
add a comment |
up vote
3
down vote
favorite
I'm new to node and web scraping in general but with some research I've written this working code.
My goal is to get all questions from X Stack Overflow's pages sorted by most rated and save that data locally to use on another project to plot charts and other analytical stuff.
How can I improve it in both readability and performance?
Any convention or language/technology nuance that I got wrong?
const CHEERIO = require('cheerio');
const REQUEST = require('request');
const JSONFRAME = require('jsonframe-cheerio');
const CORES = require('os').cpus().length;
const CLUSTER = require('cluster');
const FS = require('fs');
const LAST_PAGE_TO_SCRAPE = 10;
const QUANTITY_OF_PAGES_PER_WORKER = 1;
const BASE_URL = 'https://stackoverflow.com/questions?page=';
const URL_ORDERING = '&sort=votes';
if (CLUSTER.isMaster)
let nextPage = 1;
for (let i = 0; i < CORES; i++)
CLUSTER.fork( startingPoint: nextPage );
nextPage += 1;
CLUSTER.on('online', (worker) =>
console.log(`Worker $worker.process.pid is now working.`);
);
CLUSTER.on('exit', (worker, code, signal) =>
if (code !== 0) //restart
console.log(`Worker $worker.process.pid died. Restarting.`);
CLUSTER.fork( startingPoint: worker.process.env.startingPoint );
else //scrape next X pages
console.log(`Worker $worker.process.pid finished it's work succesfully.`);
if (nextPage <= LAST_PAGE_TO_SCRAPE)
CLUSTER.fork( startingPoint: nextPage );
nextPage += QUANTITY_OF_PAGES_PER_WORKER;
);
else
let workerStartingPoint = parseInt(process.env.startingPoint);
for (let i = workerStartingPoint; i < workerStartingPoint + QUANTITY_OF_PAGES_PER_WORKER; i++)
REQUEST(BASE_URL + i + URL_ORDERING, function (error, response, html)
if (error)
process.exit(workerStartingPoint);//error code is where the worker should start again
let $ = CHEERIO.load(html);
JSONFRAME($);
let frame =
questions:
_s: "#questions .question-summary",
_d: [
"votes": ".statscontainer .stats .vote .votes .vote-count-post strong",
"answers": ".statscontainer .stats .status strong",
"views": ".statscontainer .views",
"title": ".summary h3 a",
"tags": [".summary .tags a"],
"url": ".question-hyperlink @ href",
"user":
"name": ".summary .started .user-info .user-details a",
"profile-link": ".summary .started .user-info .user-details a @ href",
"reputation": ".summary .started .user-info .user-details .-flair .reputation-score"
,
"date asked": ".summary .started .user-info .user-action-time .relativetime"
]
let questions = $('body').scrape(frame, string: true );
FS.writeFile('page-' + i + '.json', questions, function (error)
if (error)
process.exit(workerStartingPoint);
process.exit(0);
)
);
If you're going to run this, don't set the LAST_PAGE_TO_SCRAPE
too large because your IP is going to get temporarily blocked by SO.
node.js web-scraping ecmascript-6 stackexchange
4
Stack Exchange has an API. It may be faster than web scraping, considering you could just construct a request for whatever you'd want. That being said, I don't know how hard it is to get an API key, and writing a web scraper is a fun learning experience.
– esote
Jun 20 at 19:37
Oh, thanks a lot for that! I'll use it to gain experience working with APIs, but the main idea of this was to practice writing a web scraper.
– Régis B.
Jun 20 at 19:40
add a comment |
up vote
3
down vote
favorite
up vote
3
down vote
favorite
I'm new to node and web scraping in general but with some research I've written this working code.
My goal is to get all questions from X Stack Overflow's pages sorted by most rated and save that data locally to use on another project to plot charts and other analytical stuff.
How can I improve it in both readability and performance?
Any convention or language/technology nuance that I got wrong?
const CHEERIO = require('cheerio');
const REQUEST = require('request');
const JSONFRAME = require('jsonframe-cheerio');
const CORES = require('os').cpus().length;
const CLUSTER = require('cluster');
const FS = require('fs');
const LAST_PAGE_TO_SCRAPE = 10;
const QUANTITY_OF_PAGES_PER_WORKER = 1;
const BASE_URL = 'https://stackoverflow.com/questions?page=';
const URL_ORDERING = '&sort=votes';
if (CLUSTER.isMaster)
let nextPage = 1;
for (let i = 0; i < CORES; i++)
CLUSTER.fork( startingPoint: nextPage );
nextPage += 1;
CLUSTER.on('online', (worker) =>
console.log(`Worker $worker.process.pid is now working.`);
);
CLUSTER.on('exit', (worker, code, signal) =>
if (code !== 0) //restart
console.log(`Worker $worker.process.pid died. Restarting.`);
CLUSTER.fork( startingPoint: worker.process.env.startingPoint );
else //scrape next X pages
console.log(`Worker $worker.process.pid finished it's work succesfully.`);
if (nextPage <= LAST_PAGE_TO_SCRAPE)
CLUSTER.fork( startingPoint: nextPage );
nextPage += QUANTITY_OF_PAGES_PER_WORKER;
);
else
let workerStartingPoint = parseInt(process.env.startingPoint);
for (let i = workerStartingPoint; i < workerStartingPoint + QUANTITY_OF_PAGES_PER_WORKER; i++)
REQUEST(BASE_URL + i + URL_ORDERING, function (error, response, html)
if (error)
process.exit(workerStartingPoint);//error code is where the worker should start again
let $ = CHEERIO.load(html);
JSONFRAME($);
let frame =
questions:
_s: "#questions .question-summary",
_d: [
"votes": ".statscontainer .stats .vote .votes .vote-count-post strong",
"answers": ".statscontainer .stats .status strong",
"views": ".statscontainer .views",
"title": ".summary h3 a",
"tags": [".summary .tags a"],
"url": ".question-hyperlink @ href",
"user":
"name": ".summary .started .user-info .user-details a",
"profile-link": ".summary .started .user-info .user-details a @ href",
"reputation": ".summary .started .user-info .user-details .-flair .reputation-score"
,
"date asked": ".summary .started .user-info .user-action-time .relativetime"
]
let questions = $('body').scrape(frame, string: true );
FS.writeFile('page-' + i + '.json', questions, function (error)
if (error)
process.exit(workerStartingPoint);
process.exit(0);
)
);
If you're going to run this, don't set the LAST_PAGE_TO_SCRAPE
too large because your IP is going to get temporarily blocked by SO.
node.js web-scraping ecmascript-6 stackexchange
I'm new to node and web scraping in general but with some research I've written this working code.
My goal is to get all questions from X Stack Overflow's pages sorted by most rated and save that data locally to use on another project to plot charts and other analytical stuff.
How can I improve it in both readability and performance?
Any convention or language/technology nuance that I got wrong?
const CHEERIO = require('cheerio');
const REQUEST = require('request');
const JSONFRAME = require('jsonframe-cheerio');
const CORES = require('os').cpus().length;
const CLUSTER = require('cluster');
const FS = require('fs');
const LAST_PAGE_TO_SCRAPE = 10;
const QUANTITY_OF_PAGES_PER_WORKER = 1;
const BASE_URL = 'https://stackoverflow.com/questions?page=';
const URL_ORDERING = '&sort=votes';
if (CLUSTER.isMaster)
let nextPage = 1;
for (let i = 0; i < CORES; i++)
CLUSTER.fork( startingPoint: nextPage );
nextPage += 1;
CLUSTER.on('online', (worker) =>
console.log(`Worker $worker.process.pid is now working.`);
);
CLUSTER.on('exit', (worker, code, signal) =>
if (code !== 0) //restart
console.log(`Worker $worker.process.pid died. Restarting.`);
CLUSTER.fork( startingPoint: worker.process.env.startingPoint );
else //scrape next X pages
console.log(`Worker $worker.process.pid finished it's work succesfully.`);
if (nextPage <= LAST_PAGE_TO_SCRAPE)
CLUSTER.fork( startingPoint: nextPage );
nextPage += QUANTITY_OF_PAGES_PER_WORKER;
);
else
let workerStartingPoint = parseInt(process.env.startingPoint);
for (let i = workerStartingPoint; i < workerStartingPoint + QUANTITY_OF_PAGES_PER_WORKER; i++)
REQUEST(BASE_URL + i + URL_ORDERING, function (error, response, html)
if (error)
process.exit(workerStartingPoint);//error code is where the worker should start again
let $ = CHEERIO.load(html);
JSONFRAME($);
let frame =
questions:
_s: "#questions .question-summary",
_d: [
"votes": ".statscontainer .stats .vote .votes .vote-count-post strong",
"answers": ".statscontainer .stats .status strong",
"views": ".statscontainer .views",
"title": ".summary h3 a",
"tags": [".summary .tags a"],
"url": ".question-hyperlink @ href",
"user":
"name": ".summary .started .user-info .user-details a",
"profile-link": ".summary .started .user-info .user-details a @ href",
"reputation": ".summary .started .user-info .user-details .-flair .reputation-score"
,
"date asked": ".summary .started .user-info .user-action-time .relativetime"
]
let questions = $('body').scrape(frame, string: true );
FS.writeFile('page-' + i + '.json', questions, function (error)
if (error)
process.exit(workerStartingPoint);
process.exit(0);
)
);
If you're going to run this, don't set the LAST_PAGE_TO_SCRAPE
too large because your IP is going to get temporarily blocked by SO.
node.js web-scraping ecmascript-6 stackexchange
edited Jun 20 at 20:03
200_success
123k14143399
123k14143399
asked Jun 20 at 19:16
Régis B.
162
162
4
Stack Exchange has an API. It may be faster than web scraping, considering you could just construct a request for whatever you'd want. That being said, I don't know how hard it is to get an API key, and writing a web scraper is a fun learning experience.
– esote
Jun 20 at 19:37
Oh, thanks a lot for that! I'll use it to gain experience working with APIs, but the main idea of this was to practice writing a web scraper.
– Régis B.
Jun 20 at 19:40
add a comment |
4
Stack Exchange has an API. It may be faster than web scraping, considering you could just construct a request for whatever you'd want. That being said, I don't know how hard it is to get an API key, and writing a web scraper is a fun learning experience.
– esote
Jun 20 at 19:37
Oh, thanks a lot for that! I'll use it to gain experience working with APIs, but the main idea of this was to practice writing a web scraper.
– Régis B.
Jun 20 at 19:40
4
4
Stack Exchange has an API. It may be faster than web scraping, considering you could just construct a request for whatever you'd want. That being said, I don't know how hard it is to get an API key, and writing a web scraper is a fun learning experience.
– esote
Jun 20 at 19:37
Stack Exchange has an API. It may be faster than web scraping, considering you could just construct a request for whatever you'd want. That being said, I don't know how hard it is to get an API key, and writing a web scraper is a fun learning experience.
– esote
Jun 20 at 19:37
Oh, thanks a lot for that! I'll use it to gain experience working with APIs, but the main idea of this was to practice writing a web scraper.
– Régis B.
Jun 20 at 19:40
Oh, thanks a lot for that! I'll use it to gain experience working with APIs, but the main idea of this was to practice writing a web scraper.
– Régis B.
Jun 20 at 19:40
add a comment |
active
oldest
votes
active
oldest
votes
active
oldest
votes
active
oldest
votes
active
oldest
votes
Sign up or log in
StackExchange.ready(function ()
StackExchange.helpers.onClickDraftSave('#login-link');
);
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
StackExchange.ready(
function ()
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f196916%2fscraping-stackoverflows-questions-with-node-js%23new-answer', 'question_page');
);
Post as a guest
Sign up or log in
StackExchange.ready(function ()
StackExchange.helpers.onClickDraftSave('#login-link');
);
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Sign up or log in
StackExchange.ready(function ()
StackExchange.helpers.onClickDraftSave('#login-link');
);
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Sign up or log in
StackExchange.ready(function ()
StackExchange.helpers.onClickDraftSave('#login-link');
);
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
4
Stack Exchange has an API. It may be faster than web scraping, considering you could just construct a request for whatever you'd want. That being said, I don't know how hard it is to get an API key, and writing a web scraper is a fun learning experience.
– esote
Jun 20 at 19:37
Oh, thanks a lot for that! I'll use it to gain experience working with APIs, but the main idea of this was to practice writing a web scraper.
– Régis B.
Jun 20 at 19:40