Node实践之爬数据篇!~

本意是想写一些接口感受一下Nodejs魅力,奈何没有数据,延伸了一下爬虫数据采集,让咱的数据库不太假。


cheerio 数据截取

是的,没想到有一天咱也会爬别人玩~ 哈哈哈!

  • 引入模块为了搞数据
1
const cheerio = require("cheerio");
2
const axios = require("axios").default;

飘过注意!cheerio的元素选择方式和jQuery一模一样!

  • 向爬取的url对象发起请求并进行元素选择

Image.png


了解cheerio完毕,下面正式开始咯!

  • 发起一波模块引入
1
const cheerio = require("cheerio");
2
const axios = require("axios").default;
3
const mongoose = require("mongoose");
4
const Entities = require("html-entities").XmlEntities;
5
const entities = new Entities();
6
const { Product, Detail, Category } = require("./models");
  • 连接数据库
1
// Start-up sequence: connect to the local MongoDB database, empty the three
// collections this crawler fills (products, details, categories), then kick
// off the crawl with loadData().
//
// deleteMany({}) is used instead of the deprecated Model.remove({}); both
// delete every document that matches the (empty) filter.
mongoose.connect("mongodb://localhost:27017/MangoStore-app", {
    useUnifiedTopology: true,
    useNewUrlParser: true
})
    .then(() => Promise.all([
        Product.deleteMany({}),
        Detail.deleteMany({}),
        Category.deleteMany({})
    ]))
    .then(() => loadData())
    .catch(err => {
        // Without this handler a failed connection or delete would surface
        // only as an unhandled promise rejection.
        console.error("Crawler start-up failed:", err);
    });
  • 定义一个“安全”的请求方法准备发起进攻
1
/**
 * Issue an HTTP GET through axios after a short random delay, sending a
 * randomly chosen browser User-Agent plus randomly generated
 * X-FORWARDED-FOR and CLIENT-IP headers.
 *
 * @param {string} url - Address to request.
 * @returns {Promise<object>} Resolves with the axios response. Rejects with
 *     the axios error when the request fails or times out, so callers can
 *     attach their own `.catch`.
 */
function fetchData(url) {
    const userAgents = [
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)"
    ];

    // Each octet is drawn from 0-254, the same range the header values
    // have always used.
    const randomOctet = () => Math.floor(Math.random() * 255);
    const randomIp = () => [randomOctet(), randomOctet(), randomOctet(), randomOctet()].join(".");

    // Product of two uniform random factors: a delay in [0, 100) ms that is
    // skewed toward short waits.
    const delayMs = Math.random() * 100 * Math.random();

    // Only the timer needs wrapping in a Promise; chaining the axios promise
    // after it lets request failures propagate to the caller instead of
    // leaving the returned promise pending forever.
    return new Promise(resolve => setTimeout(resolve, delayMs)).then(() =>
        axios.get(url, {
            timeout: 500000,
            headers: {
                "user-agent": userAgents[Math.floor(Math.random() * userAgents.length)],
                "X-FORWARDED-FOR": randomIp(),
                "CLIENT-IP": randomIp()
            }
        })
    );
}

我的对象是一个服装商场,在此俺先对着分类下手了

1
/**
 * Seed the Category collection with the fixed list of shop categories and
 * start crawling the product listing page of each saved category.
 *
 * @returns {Promise<void>} Settles once the categories are saved and a
 *     loadProduct() call has been started for each of them. A failed insert
 *     is logged rather than rejected, because callers do not attach a
 *     rejection handler.
 */
function loadData() {
    const types = [
        {
            name: "女鞋",
            url: "https://www.yougou.com/f-0-MXZ-0-1.html"
        },
        {
            name: "男鞋",
            url: "https://www.yougou.com/f-0-Y0A-04Y004-1.html"
        },
        {
            name: "运动",
            url: "https://www.yougou.com/f-0-PTK-0-1.html"
        },
        {
            name: "箱包",
            url: "https://www.yougou.com/f-0-6LJ-0-1.html"
        },
        {
            name: "儿童",
            url: "https://www.yougou.com/f-0-9XB-0-1.html"
        }
    ];

    return Category.insertMany(types)
        .then(categories => {
            // Explicit arrow: passing loadProduct directly to forEach would
            // feed the array index into its second (isFirst) parameter.
            categories.forEach(category => {
                loadProduct(category);
            });
        })
        .catch(err => {
            console.error("Failed to save categories:", err);
        });
}

好嘞,分类表数据有了,下面来商品的数据:

1
/**
 * Download one category listing page, extract a product record from every
 * `.proList li` element, save the records with Product.insertMany() and
 * start a detail crawl (loadDt) for each saved product.
 *
 * @param {object} category - Saved category document.
 * @param {string} category.url - Listing page address to fetch.
 * @param {string} category.name - Category name, stored on each product as `category`.
 * @param {*} category._id - Category id, stored on each product as `id`.
 * @param {boolean} [isFirst=true] - Retained for backward compatibility only;
 *     both values have always produced the same extraction.
 * @returns {Promise<void>} Settles once the products are saved and their
 *     detail crawls have been started. Fetch or insert failures are logged
 *     rather than rejected, because callers do not attach a rejection handler.
 */
function loadProduct({
    url,
    name,
    _id
}, isFirst = true) {
    return fetchData(url)
        .then(res => {
            const $ = cheerio.load(res.data.toString());
            const products = [];

            $(".proList li").each(function() {
                const $item = $(this);
                products.push({
                    title: $item
                        .find(".srchlst-wrap .bd .nptt a")
                        .text(),
                    imgUrl: $item
                        .find(".srchlst-wrap .goods-desc .collect")
                        .attr("src"),
                    // The current price is read from the `price` attribute
                    // of the third matching <em>.
                    CurrentPrice: $item
                        .find(".srchlst-wrap .bd .price_sc em")
                        .eq(2)
                        .attr("price"),
                    OriginPrice: $item
                        .find(".srchlst-wrap .bd .origin-price i")
                        .text(),
                    detailUrl: $item
                        .find(".srchlst-wrap .goods-head a")
                        .attr("href"),
                    category: name,
                    id: _id
                });
            });

            return Product.insertMany(products);
        })
        .then(savedProducts => {
            savedProducts.forEach(product => {
                loadDt(product);
            });
        })
        .catch(err => {
            console.error("Failed to load products for category " + name + ":", err);
        });
}

紧接着,来一些商品详情吧~

1
/**
 * Download one product detail page, extract a detail record from every
 * `#goodsContainer` element and save the records with Detail.insertMany().
 *
 * @param {object} product - Saved product document.
 * @param {string} product.detailUrl - Detail page address to fetch.
 * @param {*} product.name - Accepted for interface compatibility; not used.
 * @param {*} product._id - Product id, stored on each detail as `id`.
 * @param {string} product.category - Category name, stored on each detail as `name`.
 * @returns {Promise<void>} Settles once the details are saved. A product
 *     without a detail URL is skipped. Fetch or insert failures are logged
 *     rather than rejected, because callers do not attach a rejection handler.
 */
function loadDt({ detailUrl, name, _id, category }) {
    // A list item without a link yields no href; requesting it can only fail.
    if (!detailUrl) {
        console.warn("Skipping product without a detail URL:", _id);
        return Promise.resolve();
    }

    // Thumbnails are addressed by the numbered classes picSmallClass2..6.
    const thumbNumbers = [2, 3, 4, 5, 6];
    // Only the first three size options are recorded.
    const sizeIndexes = [0, 1, 2];

    return fetchData(detailUrl)
        .then(res => {
            const $ = cheerio.load(res.data.toString());
            const proDetails = [];

            $("#goodsContainer").each(function() {
                const $container = $(this);
                const mainImg = $container
                    .find("#goodsImg0 .goodsPic img")
                    .attr("src");
                const thumbs = thumbNumbers.map(n =>
                    $container
                        .find(`#spec-list .list-h li .picSmallClass${n}`)
                        .attr("src")
                );
                const $sizes = $container.find(".size .prosize .prodSpec a");

                const proDetail = {
                    imgUrl: mainImg,
                    // Main image first, followed by the five thumbnails.
                    imgUrlall: [mainImg, ...thumbs],
                    title: $container
                        .find(".shopping-container h1")
                        .text(),
                    OriginPrice: $container
                        .find(".shopping-container .good_ygprcarea #ygprice_area del")
                        .text(),
                    sizeAll: sizeIndexes.map(i => $sizes.eq(i).attr("data-name"))
                };

                // Logged before id/name are attached, so the output shows
                // only the scraped fields.
                console.log(proDetail);
                proDetail.id = _id;
                proDetail.name = category;
                proDetails.push(proDetail);
            });

            return Detail.insertMany(proDetails);
        })
        .then(() => console.log("保存详情成功"))
        .catch(err => {
            console.error("Failed to load detail page " + detailUrl + ":", err);
        });
}

OK , 够咱玩数据库和接口了,收手走起接口!~