How Can I Rewrite This With Promises?

October 03, 2023 Post a Comment

I am building a content scraper for a T-shirt website. The goal is to enter the website through only one hardcoded URL: http://shirts4mike.com. I will then find all the product pages f…

Solution 1:

There is an npm module called request-promise.

Simply require it:

var rp = require("request-promise");

Then, anywhere you are making a request, you can use request-promise instead.

For instance:

// `rp(url)` returns a promise: handle the fulfilled value in `.then`
// and any failure in `.catch`.
rp(url)
  .then((result) => {
    //do whatever
  })
  .catch((failure) => {
    console.log(failure);
  });

Solution 2:

You can use this example to convert the rest of your code sample.

// Wrap the callback-style `request` call in a promise that settles exactly once.
const promise = new Promise((resolve, reject) => {
  request("http://shirts4mike.com/", (err, response, html) => {
    // Check `err` before touching `response`: when the request itself fails
    // there may be no response object to read a status code from.
    if (err) {
      reject(err);
      return;
    }
    if (response.statusCode !== 200) {
      // `err` is empty on this path, so reject with a real Error instead.
      reject(new Error(`Unexpected status code ${response.statusCode}`));
      return;
    }
    resolve(html);
  });
});

promise
  .then((html) => {
    const $ = cheerio.load(html);
    // continue
  })
  .catch((err) => {
    // Always terminate the chain so a failed request is reported.
    console.log(err);
  });

Solution 3:

You can use the waterfall method of the async module, which gives you a clean way to resolve this issue.

I have tried to rewrite your code with this module.

I hope this works for you.

Format of waterfall

Baca Juga

// General shape of async.waterfall: each task passes its results to the next
// task through `callback(null, ...results)`; passing an error as the first
// argument skips the remaining tasks and jumps to the final callback.
async.waterfall([
  function(callback) {
    // The first task receives only `callback`; there is no previous value yet.
    callback(null, 'firstValue');
  },
  function(previousvalue, callback) {
    // Later tasks receive the previous task's results, then `callback`.
    // Every task must call `callback`, otherwise the final callback never runs.
    callback(null, previousvalue);
  }
], function(err, result) { //Final callback
  // `err` is the first error passed by any task, `result` the last task's value.
});

var async = require('async');
var cheerio = require('cheerio');
var request = require('request');
var moment = require('moment');

// Hardcoded entry URL; every other page is discovered by following links from it.
var url = 'http://shirts4mike.com/';

// URLs of t-shirt product pages (a Set, so each page is recorded only once).
var urlSet = new Set();

// First non-product page found while scanning the front page; scanned in stage 2.
var remainder;
// One object per scraped product page.
var tshirtArray = [];

/**
 * Requests a page and parses it with cheerio.
 *
 * @param {string} pageUrl - URL to request.
 * @param {function} done - Node-style callback: `done(error)` on a request
 *   error or a non-200 status, otherwise `done(null, $)` with the cheerio handle.
 */
function loadPage(pageUrl, done) {
  request(pageUrl, function(error, response, html) {
    if (error) {
      return done(error);
    }
    if (response.statusCode !== 200) {
      return done(new Error('Unexpected status code ' + response.statusCode + ' for ' + pageUrl));
    }
    done(null, cheerio.load(html));
  });
}

/**
 * Lists the absolute URL of every link on a page whose href contains "shirt".
 *
 * @param {function} $ - cheerio handle of the page.
 * @param {string} baseUrl - URL of the page, used to resolve relative hrefs.
 * @returns {string[]} Absolute link URLs.
 */
function shirtLinks($, baseUrl) {
  return $('a[href*=shirt]').map(function() {
    return new URL($(this).attr('href'), baseUrl).href;
  }).get();
}

/**
 * A page that contains a submit button is treated as a product page.
 *
 * @param {function} $ - cheerio handle of the page.
 * @returns {boolean} True when the page has a `[type=submit]` element.
 */
function isProductPage($) {
  return $('[type=submit]').length !== 0;
}

/**
 * Loads every URL in `links` and calls `onPage(link, $)` for each page that
 * loaded. A page that fails is logged and skipped so one bad link does not
 * abort the scrape. `done` is called exactly once, after all pages settled.
 *
 * @param {string[]} links - URLs to load.
 * @param {function} onPage - Called as `onPage(link, $)` per loaded page.
 * @param {function} done - Called with no arguments when every link is handled.
 */
function visitEach(links, onPage, done) {
  async.each(links, function(link, next) {
    loadPage(link, function(error, $) {
      if (error) {
        console.log(error);
      } else {
        onPage(link, $);
      }
      next();
    });
  }, function() {
    done();
  });
}

async.waterfall([
  function(callback) {
    // Stage 1: load the front page and classify every "shirt" link on it.
    loadPage(url, function(error, $) {
      if (error) {
        return callback(error);
      }
      visitEach(shirtLinks($, url), function(scrapeLink, $page) {
        if (isProductPage($page)) {
          // if page has a submit it must be a product page: add it to the set
          urlSet.add(scrapeLink);
        } else if (remainder === undefined) {
          // if not a product page, keep it so another scrape can be performed
          remainder = scrapeLink;
        }
      }, function() {
        // Signal completion once, after every link has been checked.
        callback(null, true);
      });
    });
  },
  function(previousvalue, callback) {
    // Stage 2: scan the remainder page for the remaining product pages.
    if (remainder === undefined) {
      // Every link was a product page; nothing further to scan.
      return callback(null, true);
    }
    loadPage(remainder, function(error, $) {
      if (error) {
        return callback(error);
      }
      visitEach(shirtLinks($, remainder), function(scrapeLink, $page) {
        // collect remaining product pages and add to set
        if (isProductPage($page)) {
          urlSet.add(scrapeLink);
        }
      }, function() {
        // Log only after all requests have finished, so the set is complete.
        console.log(urlSet);
        callback(null, true);
      });
    });
  },
  function(previousvalue, callback) {
    // Stage 3: scrape each product page. A Set has no `length` or index
    // access, so it is converted to an array before iterating.
    visitEach(Array.from(urlSet), function(productUrl, $) {
      // grab data and add the object into the array of tshirts
      tshirtArray.push({
        price: $('.price').text(),
        img: $('.shirt-picture').find("img").attr("src"),
        title: $('body').find(".shirt-details > h1").text().slice(4),
        url: productUrl,
        date: moment().format('MMMM Do YYYY, h:mm:ss a')
      });
    }, function() {
      callback(null, tshirtArray);
    });
  }
], function(err, result) {
  if (err) {
    console.log(err);
    return;
  }
  // call function to iterate through tshirt objects in array in order to
  // convert to JSON, then into CSV to be logged
  convertJson2Csv();
});

Solution 4:

You correctly identify promises as a way ahead to solving your timing issues.

In order to have promises available, you need to promisify request (or adopt an HTTP library whose methods return promises).

You could just fix the timing issues with promises, but you could also take the opportunity to improve the overall paradigm. Instead of discrete functions for virtually identical first/second/third stages, you can write a single function that calls itself recursively. Written correctly, this will ensure that each page in the target site is visited at most once; revisits should be avoided for the sake of overall performance and to limit the load on the target server.

// Modules being used:
var Promise = require('path/to/bluebird');
var cheerio = require('cheerio');
var moment = require('moment');

// Promisify `request` to make `request.getAsync()` available.
// Ref: http://stackoverflow.com/questions/28308131/how-do-you-properly-promisify-request
var request = Promise.promisify(require('request'));
Promise.promisifyAll(request);

// Hardcoded entry URL.
var url = 'http://shirts4mike.com/';

// URLs that have already been visited, so no page is requested twice.
var urlSet = new Set();
// One object per scraped product page.
var tshirtArray = [];

var maxLevels = 3; // limit the recursion to this number of levels.

/**
 * Scrapes one page, records it if it is a product page, then recursively
 * scrapes every "shirt" link found on it.
 *
 * @param {string} url_ - Absolute URL of the page to scrape.
 * @param {number} levelCounter - Depth of this page; the entry page is 0.
 * @returns {Promise} Settles successfully once this page and all pages below
 *   it have been handled; errors are logged and converted to `null`.
 */
function scrapePage(url_, levelCounter) {
    // Bail out if:
    //   a) the target url_ has been visited already,
    //   b) maxLevels has been reached.
    if (urlSet.has(url_) || levelCounter >= maxLevels) {
        return Promise.resolve();
    }
    urlSet.add(url_);

    // A `.then` handler receives a single value, so the HTML is read from
    // `response.body` rather than from a second handler argument.
    // NOTE(review): assumes bluebird 3 default promisification, which
    // fulfills with the response object only - confirm against the installed version.
    return request.getAsync(url_).then(function(response) {
        var $;
        if (response.statusCode !== 200) {
            throw new Error('statusCode was not 200'); // will be caught below
        }
        $ = cheerio.load(response.body);
        if ($('[type=submit]').length > 0) {
            // yay, it's a product page.
            tshirtArray.push({
                price: $('.price').text(),
                img: $('.shirt-picture').find("img").attr("src"),
                title: $('body').find(".shirt-details > h1").text().slice(4),
                url: url_,
                date: moment().format('MMMM Do YYYY, h:mm:ss a')
            });
        }
        // Find any shirt links on the page represented by $, visit each link, and scrape.
        // cheerio's map callback receives (index, element); the href is read with
        // `.attr('href')` and resolved against the current page to get an absolute URL.
        return Promise.all($("a[href*=shirt]").map(function(index, link) {
            return scrapePage(new URL($(link).attr('href'), url_).href, levelCounter + 1);
        }).get());
    }).catch(function(e) {
        // ensure "success" even if scraping threw an error.
        console.log(e);
        return null;
    });
}

scrapePage(url, 0).then(convertJson2Csv);

As you can see, a recursive solution:

- avoids repetition of code,
- will drill down as many levels as you wish, as determined by the variable maxLevels.

Note: This is still not a good solution. There's an implicit assumption here, as in the original code, that all shirt pages are reachable from the site's home page via "shirt" links alone. If shirts were reachable only via, e.g., "clothing" > "shirts", then the code above would not find any shirts.

JavaScript General

How Can I Rewrite This With Promises?

Solution 1:

Solution 2:

Solution 3:

Solution 4:

Post a Comment for "How Can I Rewrite This With Promises?"