Scraping StackOverflow's questions with Node.js

The name of the pictureThe name of the pictureThe name of the pictureClash Royale CLAN TAG#URR8PPP





.everyoneloves__top-leaderboard:empty,.everyoneloves__mid-leaderboard:empty margin-bottom:0;







up vote
3
down vote

favorite












I'm new to node and web scraping in general but with some research I've written this working code.



My goal is to get all questions from X Stack Overflow's pages sorted by most rated and save that data locally to use on another project to plot charts and other analytical stuff.



How can I improve it in both readability and performance?

Any convention or language/technology nuance that I got wrong?



// Scrapes Stack Overflow's top-voted question listing page-by-page and saves
// each page's questions to a local JSON file. The master process hands out
// page numbers to worker processes (one fork per CPU core); each worker
// fetches and parses its pages, exiting with 0 on success or with its
// starting page number so the master knows where to restart it.
//
// NOTE(review): the HTML extraction of this page stripped every `{`/`}`;
// braces below are reconstructed to restore valid JavaScript.
const CHEERIO = require('cheerio');
const REQUEST = require('request');
const JSONFRAME = require('jsonframe-cheerio');
const CORES = require('os').cpus().length;
const CLUSTER = require('cluster');
const FS = require('fs');
const LAST_PAGE_TO_SCRAPE = 10;
const QUANTITY_OF_PAGES_PER_WORKER = 1;

const BASE_URL = 'https://stackoverflow.com/questions?page=';
const URL_ORDERING = '&sort=votes';

if (CLUSTER.isMaster) {
    let nextPage = 1;
    for (let i = 0; i < CORES; i++) {
        CLUSTER.fork({ startingPoint: nextPage });
        // Fix: advance by the batch size, not 1 — otherwise two workers get
        // overlapping pages whenever QUANTITY_OF_PAGES_PER_WORKER > 1.
        nextPage += QUANTITY_OF_PAGES_PER_WORKER;
    }

    CLUSTER.on('online', (worker) => {
        console.log(`Worker ${worker.process.pid} is now working.`);
    });

    CLUSTER.on('exit', (worker, code, signal) => {
        if (code !== 0) { // non-zero exit code carries the page to restart from
            console.log(`Worker ${worker.process.pid} died. Restarting.`);
            // NOTE(review): relying on worker.process.env surviving here is
            // fragile across Node versions; the non-zero exit `code` already
            // holds the starting page — TODO confirm and prefer that.
            CLUSTER.fork({ startingPoint: worker.process.env.startingPoint });
        } else { // worker finished cleanly: hand out the next batch, if any
            console.log(`Worker ${worker.process.pid} finished its work successfully.`);
            if (nextPage <= LAST_PAGE_TO_SCRAPE) {
                CLUSTER.fork({ startingPoint: nextPage });
                nextPage += QUANTITY_OF_PAGES_PER_WORKER;
            }
        }
    });
} else {
    const workerStartingPoint = parseInt(process.env.startingPoint, 10);
    for (let i = workerStartingPoint; i < workerStartingPoint + QUANTITY_OF_PAGES_PER_WORKER; i++) {
        REQUEST(BASE_URL + i + URL_ORDERING, function (error, response, html) {
            if (error) {
                // Exit code doubles as a "restart from this page" marker.
                process.exit(workerStartingPoint);
            }

            const $ = CHEERIO.load(html);
            JSONFRAME($); // plugs the .scrape() method into this cheerio instance

            // jsonframe "frame": _s selects the repeated list items, _d
            // describes the fields to extract from each one ("@ href" pulls
            // the attribute instead of the text).
            const frame = {
                questions: {
                    _s: "#questions .question-summary",
                    _d: [{
                        "votes": ".statscontainer .stats .vote .votes .vote-count-post strong",
                        "answers": ".statscontainer .stats .status strong",
                        "views": ".statscontainer .views",
                        "title": ".summary h3 a",
                        "tags": [".summary .tags a"],
                        "url": ".question-hyperlink @ href",
                        "user": {
                            "name": ".summary .started .user-info .user-details a",
                            "profile-link": ".summary .started .user-info .user-details a @ href",
                            "reputation": ".summary .started .user-info .user-details .-flair .reputation-score"
                        },
                        "date asked": ".summary .started .user-info .user-action-time .relativetime"
                    }]
                }
            };

            const questions = $('body').scrape(frame, { string: true });
            FS.writeFile('page-' + i + '.json', questions, function (error) {
                if (error) {
                    process.exit(workerStartingPoint); // retry this page on restart
                }
                process.exit(0);
            });
        });
    }
}


If you're going to run this, don't set the LAST_PAGE_TO_SCRAPE too large, because your IP is going to get temporarily blocked by SO.







share|improve this question

















  • 4




    Stack Exchange has an API. It may be faster than web scraping, considering you could just construct a request for whatever you'd want. That being said, I don't know how hard it is to get an API key, and writing a web scraper is a fun learning experience.
    – esote
    Jun 20 at 19:37











  • Oh, thanks a lot for that! I'll use it to gain experience working with APIs, but the main idea of this was to practice writing a web scraper.
    – Régis B.
    Jun 20 at 19:40
















up vote
3
down vote

favorite












I'm new to node and web scraping in general but with some research I've written this working code.



My goal is to get all questions from X Stack Overflow's pages sorted by most rated and save that data locally to use on another project to plot charts and other analytical stuff.



How can I improve it in both readability and performance?

Any convention or language/technology nuance that I got wrong?



const CHEERIO = require('cheerio');
const REQUEST = require('request');
const JSONFRAME = require('jsonframe-cheerio');
const CORES = require('os').cpus().length;
const CLUSTER = require('cluster');
const FS = require('fs');
const LAST_PAGE_TO_SCRAPE = 10;
const QUANTITY_OF_PAGES_PER_WORKER = 1;

const BASE_URL = 'https://stackoverflow.com/questions?page=';
const URL_ORDERING = '&sort=votes';

if (CLUSTER.isMaster)
let nextPage = 1;
for (let i = 0; i < CORES; i++)
CLUSTER.fork( startingPoint: nextPage );
nextPage += 1;


CLUSTER.on('online', (worker) =>
console.log(`Worker $worker.process.pid is now working.`);
);

CLUSTER.on('exit', (worker, code, signal) =>
if (code !== 0) //restart
console.log(`Worker $worker.process.pid died. Restarting.`);
CLUSTER.fork( startingPoint: worker.process.env.startingPoint );
else //scrape next X pages
console.log(`Worker $worker.process.pid finished it's work succesfully.`);
if (nextPage <= LAST_PAGE_TO_SCRAPE)
CLUSTER.fork( startingPoint: nextPage );
nextPage += QUANTITY_OF_PAGES_PER_WORKER;


);
else
let workerStartingPoint = parseInt(process.env.startingPoint);
for (let i = workerStartingPoint; i < workerStartingPoint + QUANTITY_OF_PAGES_PER_WORKER; i++)
REQUEST(BASE_URL + i + URL_ORDERING, function (error, response, html)
if (error)
process.exit(workerStartingPoint);//error code is where the worker should start again

let $ = CHEERIO.load(html);
JSONFRAME($);
let frame =
questions:
_s: "#questions .question-summary",
_d: [
"votes": ".statscontainer .stats .vote .votes .vote-count-post strong",
"answers": ".statscontainer .stats .status strong",
"views": ".statscontainer .views",
"title": ".summary h3 a",
"tags": [".summary .tags a"],
"url": ".question-hyperlink @ href",
"user":
"name": ".summary .started .user-info .user-details a",
"profile-link": ".summary .started .user-info .user-details a @ href",
"reputation": ".summary .started .user-info .user-details .-flair .reputation-score"
,
"date asked": ".summary .started .user-info .user-action-time .relativetime"
]


let questions = $('body').scrape(frame, string: true );
FS.writeFile('page-' + i + '.json', questions, function (error)
if (error)
process.exit(workerStartingPoint);

process.exit(0);
)
);




If you're going to run this, don't set the LAST_PAGE_TO_SCRAPE too large, because your IP is going to get temporarily blocked by SO.







share|improve this question

















  • 4




    Stack Exchange has an API. It may be faster than web scraping, considering you could just construct a request for whatever you'd want. That being said, I don't know how hard it is to get an API key, and writing a web scraper is a fun learning experience.
    – esote
    Jun 20 at 19:37











  • Oh,thanks a lot for that! I'll use it to earn experience working with API's but the main idea of this was to train writing web scraper.
    – Régis B.
    Jun 20 at 19:40












up vote
3
down vote

favorite









up vote
3
down vote

favorite











I'm new to node and web scraping in general but with some research I've written this working code.



My goal is to get all questions from X Stack Overflow's pages sorted by most rated and save that data locally to use on another project to plot charts and other analytical stuff.



How can I improve it in both readability and perfomance?

Any convention or language/technology nuance that I got wrong?



const CHEERIO = require('cheerio');
const REQUEST = require('request');
const JSONFRAME = require('jsonframe-cheerio');
const CORES = require('os').cpus().length;
const CLUSTER = require('cluster');
const FS = require('fs');
const LAST_PAGE_TO_SCRAPE = 10;
const QUANTITY_OF_PAGES_PER_WORKER = 1;

const BASE_URL = 'https://stackoverflow.com/questions?page=';
const URL_ORDERING = '&sort=votes';

if (CLUSTER.isMaster)
let nextPage = 1;
for (let i = 0; i < CORES; i++)
CLUSTER.fork( startingPoint: nextPage );
nextPage += 1;


CLUSTER.on('online', (worker) =>
console.log(`Worker $worker.process.pid is now working.`);
);

CLUSTER.on('exit', (worker, code, signal) =>
if (code !== 0) //restart
console.log(`Worker $worker.process.pid died. Restarting.`);
CLUSTER.fork( startingPoint: worker.process.env.startingPoint );
else //scrape next X pages
console.log(`Worker $worker.process.pid finished it's work succesfully.`);
if (nextPage <= LAST_PAGE_TO_SCRAPE)
CLUSTER.fork( startingPoint: nextPage );
nextPage += QUANTITY_OF_PAGES_PER_WORKER;


);
else
let workerStartingPoint = parseInt(process.env.startingPoint);
for (let i = workerStartingPoint; i < workerStartingPoint + QUANTITY_OF_PAGES_PER_WORKER; i++)
REQUEST(BASE_URL + i + URL_ORDERING, function (error, response, html)
if (error)
process.exit(workerStartingPoint);//error code is where the worker should start again

let $ = CHEERIO.load(html);
JSONFRAME($);
let frame =
questions:
_s: "#questions .question-summary",
_d: [
"votes": ".statscontainer .stats .vote .votes .vote-count-post strong",
"answers": ".statscontainer .stats .status strong",
"views": ".statscontainer .views",
"title": ".summary h3 a",
"tags": [".summary .tags a"],
"url": ".question-hyperlink @ href",
"user":
"name": ".summary .started .user-info .user-details a",
"profile-link": ".summary .started .user-info .user-details a @ href",
"reputation": ".summary .started .user-info .user-details .-flair .reputation-score"
,
"date asked": ".summary .started .user-info .user-action-time .relativetime"
]


let questions = $('body').scrape(frame, string: true );
FS.writeFile('page-' + i + '.json', questions, function (error)
if (error)
process.exit(workerStartingPoint);

process.exit(0);
)
);




If you're going to run this,don't set the LAST_PAGE_TO_SCRAPE too large because your IP is going to get temporarily blocked by SO.







share|improve this question













I'm new to node and web scraping in general but with some research I've written this working code.



My goal is to get all questions from X Stack Overflow's pages sorted by most rated and save that data locally to use on another project to plot charts and other analytical stuff.



How can I improve it in both readability and perfomance?

Any convention or language/technology nuance that I got wrong?



const CHEERIO = require('cheerio');
const REQUEST = require('request');
const JSONFRAME = require('jsonframe-cheerio');
const CORES = require('os').cpus().length;
const CLUSTER = require('cluster');
const FS = require('fs');
const LAST_PAGE_TO_SCRAPE = 10;
const QUANTITY_OF_PAGES_PER_WORKER = 1;

const BASE_URL = 'https://stackoverflow.com/questions?page=';
const URL_ORDERING = '&sort=votes';

if (CLUSTER.isMaster)
let nextPage = 1;
for (let i = 0; i < CORES; i++)
CLUSTER.fork( startingPoint: nextPage );
nextPage += 1;


CLUSTER.on('online', (worker) =>
console.log(`Worker $worker.process.pid is now working.`);
);

CLUSTER.on('exit', (worker, code, signal) =>
if (code !== 0) //restart
console.log(`Worker $worker.process.pid died. Restarting.`);
CLUSTER.fork( startingPoint: worker.process.env.startingPoint );
else //scrape next X pages
console.log(`Worker $worker.process.pid finished it's work succesfully.`);
if (nextPage <= LAST_PAGE_TO_SCRAPE)
CLUSTER.fork( startingPoint: nextPage );
nextPage += QUANTITY_OF_PAGES_PER_WORKER;


);
else
let workerStartingPoint = parseInt(process.env.startingPoint);
for (let i = workerStartingPoint; i < workerStartingPoint + QUANTITY_OF_PAGES_PER_WORKER; i++)
REQUEST(BASE_URL + i + URL_ORDERING, function (error, response, html)
if (error)
process.exit(workerStartingPoint);//error code is where the worker should start again

let $ = CHEERIO.load(html);
JSONFRAME($);
let frame =
questions:
_s: "#questions .question-summary",
_d: [
"votes": ".statscontainer .stats .vote .votes .vote-count-post strong",
"answers": ".statscontainer .stats .status strong",
"views": ".statscontainer .views",
"title": ".summary h3 a",
"tags": [".summary .tags a"],
"url": ".question-hyperlink @ href",
"user":
"name": ".summary .started .user-info .user-details a",
"profile-link": ".summary .started .user-info .user-details a @ href",
"reputation": ".summary .started .user-info .user-details .-flair .reputation-score"
,
"date asked": ".summary .started .user-info .user-action-time .relativetime"
]


let questions = $('body').scrape(frame, string: true );
FS.writeFile('page-' + i + '.json', questions, function (error)
if (error)
process.exit(workerStartingPoint);

process.exit(0);
)
);




If you're going to run this,don't set the LAST_PAGE_TO_SCRAPE too large because your IP is going to get temporarily blocked by SO.









share|improve this question












share|improve this question




share|improve this question








edited Jun 20 at 20:03









200_success

123k14143399




123k14143399









asked Jun 20 at 19:16









Régis B.

162




162







  • 4




    Stack Exchange has an API. It may be faster than web scraping, considering you could just construct a request for whatever you'd want. That being said, I don't know how hard it is to get an API key, and writing a web scraper is a fun learning experience.
    – esote
    Jun 20 at 19:37











  • Oh,thanks a lot for that! I'll use it to earn experience working with API's but the main idea of this was to train writing web scraper.
    – Régis B.
    Jun 20 at 19:40












  • 4




    Stack Exchange has an API. It may be faster than web scraping, considering you could just construct a request for whatever you'd want. That being said, I don't know how hard it is to get an API key, and writing a web scraper is a fun learning experience.
    – esote
    Jun 20 at 19:37











  • Oh,thanks a lot for that! I'll use it to earn experience working with API's but the main idea of this was to train writing web scraper.
    – Régis B.
    Jun 20 at 19:40







4




4




Stack Exchange has an API. It may be faster than web scraping, considering you could just construct a request for whatever you'd want. That being said, I don't know how hard it is to get an API key, and writing a web scraper is a fun learning experience.
– esote
Jun 20 at 19:37





Stack Exchange has an API. It may be faster than web scraping, considering you could just construct a request for whatever you'd want. That being said, I don't know how hard it is to get an API key, and writing a web scraper is a fun learning experience.
– esote
Jun 20 at 19:37













Oh,thanks a lot for that! I'll use it to earn experience working with API's but the main idea of this was to train writing web scraper.
– Régis B.
Jun 20 at 19:40




Oh,thanks a lot for that! I'll use it to earn experience working with API's but the main idea of this was to train writing web scraper.
– Régis B.
Jun 20 at 19:40















active

oldest

votes











Your Answer




StackExchange.ifUsing("editor", function ()
return StackExchange.using("mathjaxEditing", function ()
StackExchange.MarkdownEditor.creationCallbacks.add(function (editor, postfix)
StackExchange.mathjaxEditing.prepareWmdForMathJax(editor, postfix, [["\$", "\$"]]);
);
);
, "mathjax-editing");

StackExchange.ifUsing("editor", function ()
StackExchange.using("externalEditor", function ()
StackExchange.using("snippets", function ()
StackExchange.snippets.init();
);
);
, "code-snippets");

StackExchange.ready(function()
var channelOptions =
tags: "".split(" "),
id: "196"
;
initTagRenderer("".split(" "), "".split(" "), channelOptions);

StackExchange.using("externalEditor", function()
// Have to fire editor after snippets, if snippets enabled
if (StackExchange.settings.snippets.snippetsEnabled)
StackExchange.using("snippets", function()
createEditor();
);

else
createEditor();

);

function createEditor()
StackExchange.prepareEditor(
heartbeatType: 'answer',
convertImagesToLinks: false,
noModals: false,
showLowRepImageUploadWarning: true,
reputationToPostImages: null,
bindNavPrevention: true,
postfix: "",
onDemand: true,
discardSelector: ".discard-answer"
,immediatelyShowMarkdownHelp:true
);



);








 

draft saved


draft discarded


















StackExchange.ready(
function ()
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f196916%2fscraping-stackoverflows-questions-with-node-js%23new-answer', 'question_page');

);

Post as a guest



































active

oldest

votes













active

oldest

votes









active

oldest

votes






active

oldest

votes










 

draft saved


draft discarded


























 


draft saved


draft discarded














StackExchange.ready(
function ()
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f196916%2fscraping-stackoverflows-questions-with-node-js%23new-answer', 'question_page');

);

Post as a guest













































































Popular posts from this blog

Greedy Best First Search implementation in Rust

Function to Return a JSON Like Objects Using VBA Collections and Arrays

C++11 CLH Lock Implementation