Scraping StackOverflow's questions with Node.js

The name of the pictureThe name of the pictureThe name of the pictureClash Royale CLAN TAG#URR8PPP





.everyoneloves__top-leaderboard:empty,.everyoneloves__mid-leaderboard:empty margin-bottom:0;







up vote
3
down vote

favorite












I'm new to node and web scraping in general but with some research I've written this working code.



My goal is to get all questions from X Stack Overflow's pages sorted by most rated and save that data locally to use on another project to plot charts and other analytical stuff.



How can I improve it in both readability and performance?

Any convention or language/technology nuance that I got wrong?



// Scrapes Stack Overflow's top-voted question listing page-by-page and saves
// each page's questions to a local JSON file. The master process hands out
// page numbers to worker processes (one fork per CPU core); each worker
// fetches and parses its pages, exiting with 0 on success or with its
// starting page number so the master knows where to restart it.
//
// NOTE(review): the HTML extraction of this page stripped every `{`/`}`;
// braces below are reconstructed to restore valid JavaScript.
const CHEERIO = require('cheerio');
const REQUEST = require('request');
const JSONFRAME = require('jsonframe-cheerio');
const CORES = require('os').cpus().length;
const CLUSTER = require('cluster');
const FS = require('fs');
const LAST_PAGE_TO_SCRAPE = 10;
const QUANTITY_OF_PAGES_PER_WORKER = 1;

const BASE_URL = 'https://stackoverflow.com/questions?page=';
const URL_ORDERING = '&sort=votes';

if (CLUSTER.isMaster) {
    let nextPage = 1;
    for (let i = 0; i < CORES; i++) {
        CLUSTER.fork({ startingPoint: nextPage });
        // Fix: advance by the batch size, not 1 — otherwise two workers get
        // overlapping pages whenever QUANTITY_OF_PAGES_PER_WORKER > 1.
        nextPage += QUANTITY_OF_PAGES_PER_WORKER;
    }

    CLUSTER.on('online', (worker) => {
        console.log(`Worker ${worker.process.pid} is now working.`);
    });

    CLUSTER.on('exit', (worker, code, signal) => {
        if (code !== 0) { // non-zero exit code carries the page to restart from
            console.log(`Worker ${worker.process.pid} died. Restarting.`);
            // NOTE(review): relying on worker.process.env surviving here is
            // fragile across Node versions; the non-zero exit `code` already
            // holds the starting page — TODO confirm and prefer that.
            CLUSTER.fork({ startingPoint: worker.process.env.startingPoint });
        } else { // worker finished cleanly: hand out the next batch, if any
            console.log(`Worker ${worker.process.pid} finished its work successfully.`);
            if (nextPage <= LAST_PAGE_TO_SCRAPE) {
                CLUSTER.fork({ startingPoint: nextPage });
                nextPage += QUANTITY_OF_PAGES_PER_WORKER;
            }
        }
    });
} else {
    const workerStartingPoint = parseInt(process.env.startingPoint, 10);
    for (let i = workerStartingPoint; i < workerStartingPoint + QUANTITY_OF_PAGES_PER_WORKER; i++) {
        REQUEST(BASE_URL + i + URL_ORDERING, function (error, response, html) {
            if (error) {
                // Exit code doubles as a "restart from this page" marker.
                process.exit(workerStartingPoint);
            }

            const $ = CHEERIO.load(html);
            JSONFRAME($); // plugs the .scrape() method into this cheerio instance

            // jsonframe "frame": _s selects the repeated list items, _d
            // describes the fields to extract from each one ("@ href" pulls
            // the attribute instead of the text).
            const frame = {
                questions: {
                    _s: "#questions .question-summary",
                    _d: [{
                        "votes": ".statscontainer .stats .vote .votes .vote-count-post strong",
                        "answers": ".statscontainer .stats .status strong",
                        "views": ".statscontainer .views",
                        "title": ".summary h3 a",
                        "tags": [".summary .tags a"],
                        "url": ".question-hyperlink @ href",
                        "user": {
                            "name": ".summary .started .user-info .user-details a",
                            "profile-link": ".summary .started .user-info .user-details a @ href",
                            "reputation": ".summary .started .user-info .user-details .-flair .reputation-score"
                        },
                        "date asked": ".summary .started .user-info .user-action-time .relativetime"
                    }]
                }
            };

            const questions = $('body').scrape(frame, { string: true });
            FS.writeFile('page-' + i + '.json', questions, function (error) {
                if (error) {
                    process.exit(workerStartingPoint); // retry this page on restart
                }
                process.exit(0);
            });
        });
    }
}


If you're going to run this, don't set the LAST_PAGE_TO_SCRAPE too large, because your IP is going to get temporarily blocked by SO.







share|improve this question

















  • 4




    Stack Exchange has an API. It may be faster than web scraping, considering you could just construct a request for whatever you'd want. That being said, I don't know how hard it is to get an API key, and writing a web scraper is a fun learning experience.
    – esote
    Jun 20 at 19:37











  • Oh, thanks a lot for that! I'll use it to gain experience working with APIs, but the main idea of this was to practice writing a web scraper.
    – Régis B.
    Jun 20 at 19:40
















up vote
3
down vote

favorite












I'm new to node and web scraping in general but with some research I've written this working code.



My goal is to get all questions from X Stack Overflow's pages sorted by most rated and save that data locally to use on another project to plot charts and other analytical stuff.



How can I improve it in both readability and performance?

Any convention or language/technology nuance that I got wrong?



const CHEERIO = require('cheerio');
const REQUEST = require('request');
const JSONFRAME = require('jsonframe-cheerio');
const CORES = require('os').cpus().length;
const CLUSTER = require('cluster');
const FS = require('fs');
const LAST_PAGE_TO_SCRAPE = 10;
const QUANTITY_OF_PAGES_PER_WORKER = 1;

const BASE_URL = 'https://stackoverflow.com/questions?page=';
const URL_ORDERING = '&sort=votes';

if (CLUSTER.isMaster)
let nextPage = 1;
for (let i = 0; i < CORES; i++)
CLUSTER.fork( startingPoint: nextPage );
nextPage += 1;


CLUSTER.on('online', (worker) =>
console.log(`Worker $worker.process.pid is now working.`);
);

CLUSTER.on('exit', (worker, code, signal) =>
if (code !== 0) //restart
console.log(`Worker $worker.process.pid died. Restarting.`);
CLUSTER.fork( startingPoint: worker.process.env.startingPoint );
else //scrape next X pages
console.log(`Worker $worker.process.pid finished it's work succesfully.`);
if (nextPage <= LAST_PAGE_TO_SCRAPE)
CLUSTER.fork( startingPoint: nextPage );
nextPage += QUANTITY_OF_PAGES_PER_WORKER;


);
else
let workerStartingPoint = parseInt(process.env.startingPoint);
for (let i = workerStartingPoint; i < workerStartingPoint + QUANTITY_OF_PAGES_PER_WORKER; i++)
REQUEST(BASE_URL + i + URL_ORDERING, function (error, response, html)
if (error)
process.exit(workerStartingPoint);//error code is where the worker should start again

let $ = CHEERIO.load(html);
JSONFRAME($);
let frame =
questions:
_s: "#questions .question-summary",
_d: [
"votes": ".statscontainer .stats .vote .votes .vote-count-post strong",
"answers": ".statscontainer .stats .status strong",
"views": ".statscontainer .views",
"title": ".summary h3 a",
"tags": [".summary .tags a"],
"url": ".question-hyperlink @ href",
"user":
"name": ".summary .started .user-info .user-details a",
"profile-link": ".summary .started .user-info .user-details a @ href",
"reputation": ".summary .started .user-info .user-details .-flair .reputation-score"
,
"date asked": ".summary .started .user-info .user-action-time .relativetime"
]


let questions = $('body').scrape(frame, string: true );
FS.writeFile('page-' + i + '.json', questions, function (error)
if (error)
process.exit(workerStartingPoint);

process.exit(0);
)
);




If you're going to run this, don't set the LAST_PAGE_TO_SCRAPE too large, because your IP is going to get temporarily blocked by SO.







share|improve this question

















  • 4




    Stack Exchange has an API. It may be faster than web scraping, considering you could just construct a request for whatever you'd want. That being said, I don't know how hard it is to get an API key, and writing a web scraper is a fun learning experience.
    – esote
    Jun 20 at 19:37











  • Oh,thanks a lot for that! I'll use it to earn experience working with API's but the main idea of this was to train writing web scraper.
    – Régis B.
    Jun 20 at 19:40












up vote
3
down vote

favorite









up vote
3
down vote

favorite











I'm new to node and web scraping in general but with some research I've written this working code.



My goal is to get all questions from X Stack Overflow's pages sorted by most rated and save that data locally to use on another project to plot charts and other analytical stuff.



How can I improve it in both readability and perfomance?

Any convention or language/technology nuance that I got wrong?



const CHEERIO = require('cheerio');
const REQUEST = require('request');
const JSONFRAME = require('jsonframe-cheerio');
const CORES = require('os').cpus().length;
const CLUSTER = require('cluster');
const FS = require('fs');
const LAST_PAGE_TO_SCRAPE = 10;
const QUANTITY_OF_PAGES_PER_WORKER = 1;

const BASE_URL = 'https://stackoverflow.com/questions?page=';
const URL_ORDERING = '&sort=votes';

if (CLUSTER.isMaster)
let nextPage = 1;
for (let i = 0; i < CORES; i++)
CLUSTER.fork( startingPoint: nextPage );
nextPage += 1;


CLUSTER.on('online', (worker) =>
console.log(`Worker $worker.process.pid is now working.`);
);

CLUSTER.on('exit', (worker, code, signal) =>
if (code !== 0) //restart
console.log(`Worker $worker.process.pid died. Restarting.`);
CLUSTER.fork( startingPoint: worker.process.env.startingPoint );
else //scrape next X pages
console.log(`Worker $worker.process.pid finished it's work succesfully.`);
if (nextPage <= LAST_PAGE_TO_SCRAPE)
CLUSTER.fork( startingPoint: nextPage );
nextPage += QUANTITY_OF_PAGES_PER_WORKER;


);
else
let workerStartingPoint = parseInt(process.env.startingPoint);
for (let i = workerStartingPoint; i < workerStartingPoint + QUANTITY_OF_PAGES_PER_WORKER; i++)
REQUEST(BASE_URL + i + URL_ORDERING, function (error, response, html)
if (error)
process.exit(workerStartingPoint);//error code is where the worker should start again

let $ = CHEERIO.load(html);
JSONFRAME($);
let frame =
questions:
_s: "#questions .question-summary",
_d: [
"votes": ".statscontainer .stats .vote .votes .vote-count-post strong",
"answers": ".statscontainer .stats .status strong",
"views": ".statscontainer .views",
"title": ".summary h3 a",
"tags": [".summary .tags a"],
"url": ".question-hyperlink @ href",
"user":
"name": ".summary .started .user-info .user-details a",
"profile-link": ".summary .started .user-info .user-details a @ href",
"reputation": ".summary .started .user-info .user-details .-flair .reputation-score"
,
"date asked": ".summary .started .user-info .user-action-time .relativetime"
]


let questions = $('body').scrape(frame, string: true );
FS.writeFile('page-' + i + '.json', questions, function (error)
if (error)
process.exit(workerStartingPoint);

process.exit(0);
)
);




If you're going to run this,don't set the LAST_PAGE_TO_SCRAPE too large because your IP is going to get temporarily blocked by SO.







share|improve this question













I'm new to node and web scraping in general but with some research I've written this working code.



My goal is to get all questions from X Stack Overflow's pages sorted by most rated and save that data locally to use on another project to plot charts and other analytical stuff.



How can I improve it in both readability and perfomance?

Any convention or language/technology nuance that I got wrong?



const CHEERIO = require('cheerio');
const REQUEST = require('request');
const JSONFRAME = require('jsonframe-cheerio');
const CORES = require('os').cpus().length;
const CLUSTER = require('cluster');
const FS = require('fs');
const LAST_PAGE_TO_SCRAPE = 10;
const QUANTITY_OF_PAGES_PER_WORKER = 1;

const BASE_URL = 'https://stackoverflow.com/questions?page=';
const URL_ORDERING = '&sort=votes';

if (CLUSTER.isMaster)
let nextPage = 1;
for (let i = 0; i < CORES; i++)
CLUSTER.fork( startingPoint: nextPage );
nextPage += 1;


CLUSTER.on('online', (worker) =>
console.log(`Worker $worker.process.pid is now working.`);
);

CLUSTER.on('exit', (worker, code, signal) =>
if (code !== 0) //restart
console.log(`Worker $worker.process.pid died. Restarting.`);
CLUSTER.fork( startingPoint: worker.process.env.startingPoint );
else //scrape next X pages
console.log(`Worker $worker.process.pid finished it's work succesfully.`);
if (nextPage <= LAST_PAGE_TO_SCRAPE)
CLUSTER.fork( startingPoint: nextPage );
nextPage += QUANTITY_OF_PAGES_PER_WORKER;


);
else
let workerStartingPoint = parseInt(process.env.startingPoint);
for (let i = workerStartingPoint; i < workerStartingPoint + QUANTITY_OF_PAGES_PER_WORKER; i++)
REQUEST(BASE_URL + i + URL_ORDERING, function (error, response, html)
if (error)
process.exit(workerStartingPoint);//error code is where the worker should start again

let $ = CHEERIO.load(html);
JSONFRAME($);
let frame =
questions:
_s: "#questions .question-summary",
_d: [
"votes": ".statscontainer .stats .vote .votes .vote-count-post strong",
"answers": ".statscontainer .stats .status strong",
"views": ".statscontainer .views",
"title": ".summary h3 a",
"tags": [".summary .tags a"],
"url": ".question-hyperlink @ href",
"user":
"name": ".summary .started .user-info .user-details a",
"profile-link": ".summary .started .user-info .user-details a @ href",
"reputation": ".summary .started .user-info .user-details .-flair .reputation-score"
,
"date asked": ".summary .started .user-info .user-action-time .relativetime"
]


let questions = $('body').scrape(frame, string: true );
FS.writeFile('page-' + i + '.json', questions, function (error)
if (error)
process.exit(workerStartingPoint);

process.exit(0);
)
);




If you're going to run this,don't set the LAST_PAGE_TO_SCRAPE too large because your IP is going to get temporarily blocked by SO.









share|improve this question












share|improve this question




share|improve this question








edited Jun 20 at 20:03









200_success

123k14143399




123k14143399









asked Jun 20 at 19:16









Régis B.

162




162







  • 4




    Stack Exchange has an API. It may be faster than web scraping, considering you could just construct a request for whatever you'd want. That being said, I don't know how hard it is to get an API key, and writing a web scraper is a fun learning experience.
    – esote
    Jun 20 at 19:37











  • Oh,thanks a lot for that! I'll use it to earn experience working with API's but the main idea of this was to train writing web scraper.
    – Régis B.
    Jun 20 at 19:40












  • 4




    Stack Exchange has an API. It may be faster than web scraping, considering you could just construct a request for whatever you'd want. That being said, I don't know how hard it is to get an API key, and writing a web scraper is a fun learning experience.
    – esote
    Jun 20 at 19:37











  • Oh,thanks a lot for that! I'll use it to earn experience working with API's but the main idea of this was to train writing web scraper.
    – Régis B.
    Jun 20 at 19:40







4




4




Stack Exchange has an API. It may be faster than web scraping, considering you could just construct a request for whatever you'd want. That being said, I don't know how hard it is to get an API key, and writing a web scraper is a fun learning experience.
– esote
Jun 20 at 19:37





Stack Exchange has an API. It may be faster than web scraping, considering you could just construct a request for whatever you'd want. That being said, I don't know how hard it is to get an API key, and writing a web scraper is a fun learning experience.
– esote
Jun 20 at 19:37













Oh,thanks a lot for that! I'll use it to earn experience working with API's but the main idea of this was to train writing web scraper.
– Régis B.
Jun 20 at 19:40




Oh,thanks a lot for that! I'll use it to earn experience working with API's but the main idea of this was to train writing web scraper.
– Régis B.
Jun 20 at 19:40















active

oldest

votes











Your Answer




StackExchange.ifUsing("editor", function ()
return StackExchange.using("mathjaxEditing", function ()
StackExchange.MarkdownEditor.creationCallbacks.add(function (editor, postfix)
StackExchange.mathjaxEditing.prepareWmdForMathJax(editor, postfix, [["\$", "\$"]]);
);
);
, "mathjax-editing");

StackExchange.ifUsing("editor", function ()
StackExchange.using("externalEditor", function ()
StackExchange.using("snippets", function ()
StackExchange.snippets.init();
);
);
, "code-snippets");

StackExchange.ready(function()
var channelOptions =
tags: "".split(" "),
id: "196"
;
initTagRenderer("".split(" "), "".split(" "), channelOptions);

StackExchange.using("externalEditor", function()
// Have to fire editor after snippets, if snippets enabled
if (StackExchange.settings.snippets.snippetsEnabled)
StackExchange.using("snippets", function()
createEditor();
);

else
createEditor();

);

function createEditor()
StackExchange.prepareEditor(
heartbeatType: 'answer',
convertImagesToLinks: false,
noModals: false,
showLowRepImageUploadWarning: true,
reputationToPostImages: null,
bindNavPrevention: true,
postfix: "",
onDemand: true,
discardSelector: ".discard-answer"
,immediatelyShowMarkdownHelp:true
);



);








 

draft saved


draft discarded


















StackExchange.ready(
function ()
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f196916%2fscraping-stackoverflows-questions-with-node-js%23new-answer', 'question_page');

);

Post as a guest



































active

oldest

votes













active

oldest

votes









active

oldest

votes






active

oldest

votes










 

draft saved


draft discarded


























 


draft saved


draft discarded














StackExchange.ready(
function ()
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f196916%2fscraping-stackoverflows-questions-with-node-js%23new-answer', 'question_page');

);

Post as a guest













































































Popular posts from this blog

Greedy Best First Search implementation in Rust

Function to Return a JSON Like Objects Using VBA Collections and Arrays

C++11 CLH Lock Implementation