我用 Node.js 构建了一个简单的网页抓取工具,同时还使用了 Express.js、Request 和 Cheerio。我正在尝试抓取以下网址:http://www.houzz.com/professionals/c/Nashville,-TN

var express = require('express');
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var app     = express();

/**
 * GET /scrape
 *
 * Downloads the Houzz professionals listing, collects the text of every
 * `.pro-title`, `.pro-phone`, `.pro-review-string` and `.pro-description`
 * element into four parallel arrays, and writes them to `houzz.json` in the
 * current working directory.
 *
 * Responds with 'Check your console!' once the file has been written, or with
 * HTTP 500 when the page could not be downloaded or the file could not be
 * saved.
 */
app.get('/scrape', function(req, res){

    // Declared locally so the URL does not leak out as an implicit global.
    var url = 'http://www.houzz.com/professionals/c/Nashville,-TN';

    request(url, function(error, response, html){
        // Without a page there is nothing to parse or write: report and stop
        // instead of trying to serialise an undefined result.
        if(error){
            console.error('Request for ' + url + ' failed:', error);
            res.status(500).send('Scrape failed - check your console!');
            return;
        }

        var $ = cheerio.load(html);
        var title = [], contact = [], review = [], description = [];
        // Every field is always an array, even when its selector matches
        // nothing, so consumers of houzz.json see a consistent shape.
        var json = { title : title, contact : contact, review: review, description: description };

        // These loops only collect text, so .each() is used rather than
        // .filter(), whose purpose is to build a reduced selection.
        $('.pro-title').each(function(){
            title.push($(this).text());
            console.log(title);
        });

        $('.pro-phone').each(function(){
            contact.push($(this).text());
            console.log(contact);
        });

        // The review count is the text of the first child element.
        $('.pro-review-string').each(function(){
            review.push($(this).children().first().text());
        });

        $('.pro-description').each(function(){
            description.push($(this).text());
        });

        fs.writeFile('houzz.json', JSON.stringify(json, null, 4), function(err){
            // Only claim success when the write actually succeeded.
            if(err){
                console.error('Could not write houzz.json:', err);
                res.status(500).send('Could not write houzz.json - check your console!');
                return;
            }

            console.log('File successfully written! - Check your project directory for the houzz.json file');
            res.send('Check your console!');
        });
    });
});

// Start listening on port 8081 and note the port on the console.
app.listen('8081');
console.log('Port 8081');

// Make the app the module's export and keep the local `exports` alias in sync.
module.exports = app;
exports = module.exports;


输出的houzz.json文件的示例如下所示:

{
    "title": [
        "Marcelle Guilbeau, Interior Designer",
        "Country Flooring DIrect",
        "Eric Ross Interiors, LLC",
        "Hermitage Kitchen Design Gallery",
        "William Johnson Architect"
    ],
    "contact": [
        "(615) 815-9309",
        "(615) 646-0366",
        "(615) 472-8236",
        "(615) 843-3310",
        "(615) 292-4017"
    ],
    "review": [
        "77",
        "1",
        "14",
        "14",
        "15"
    ],
    "description": [
        "Marcelle takes her clients on a journey, drawing out their needs to create an oasis that reflects their personal sense of style and renews their connection to those things about...\t\t\tRead More\n\t\t\t",
        "Country Flooring Direct is the local flooring option that will handle your flooring needs. Give Country Flooring Direct a call and find out why lower overhead means lower prices.\t\t\tSee my projects\n\t\t",
        "Eric Ross Interiors exists to create beautiful interiors and a luxury design experience for its clients.  We are committed to creating whole room environments for our clients in...\t\t\tRead More\n\t\t\t",
        "We are a total design center that offers the finest in custom cabinetry, with the best possible level of creativity, design and service. We are located within Hermitage Lighting Gallery.\t\t\tSee my projects\n\t\t",
        "William C. Johnson Architect, LLC is a small, full service architectural design firm. Since 1985, WCJA has helped clients achieve their design goals, from small residential...\t\t\tRead More\n\t\t\t"
    ]
}


如何重组houzz.json文件并使它看起来像这样:

{
0:
  [
    title:
    contact:
    review:
    description:
  ]
1:
  [
    title:
    contact:
    review:
    description:
  ]
}

最佳答案

在我看来,您正在以无序的方式获取内容。

您应该先获取每个 “vcard” 类的元素,然后遍历其中所需的元素(pro-title、pro-phone 等)。

DOM元素的结构已经可以帮助您使其井井有条。

vcard
    pro-title
    pro-phone
    pro-review-string
    pro-description

vcard
    pro-title
    pro-phone
    pro-review-string
    pro-description


因此代码大致如下所示。您可能需要使用 $(this).find()。

  // One record per `.vcard` element, so the fields found inside the same card
  // stay together instead of ending up in four unrelated lists.
  var allmycards = [];
  $('.vcard').each(function () {
    var card = $(this);
    var title = [], contact = [], review = [], description = [];
    // Every field is always an array, even when the card lacks that element,
    // so each record has the same shape.
    var json = { title : title, contact : contact, review: review, description: description };

    // .find() limits each lookup to the current card. These loops only collect
    // text, so .each() is used rather than .filter().
    card.find('.pro-title').each(function () {
      title.push($(this).text());
      console.log(title);
    });

    card.find('.pro-phone').each(function () {
      contact.push($(this).text());
      console.log(contact);
    });

    // The review count is the text of the first child element.
    card.find('.pro-review-string').each(function () {
      review.push($(this).children().first().text());
    });

    card.find('.pro-description').each(function () {
      description.push($(this).text());
    });

    allmycards.push(json);
  });

07-26 03:41