Node.js-Cheerio
Node实践之爬数据篇!~
本意是想写一些接口,感受一下 Node.js 的魅力,奈何没有数据,于是延伸了一下,做了爬虫数据采集,让咱的数据库不太假。
cheerio 数据截取
是的,没想到有一天咱也会爬别人玩~ 哈哈哈!
- 引入模块为了搞数据
1 | const cheerio = require("cheerio"); |
2 | const axios = require("axios").default; |
飘过注意!cheerio的元素选择方式和jQuery一模一样!
- 向爬取的url对象发起请求并进行元素选择
了解cheerio完毕,下面正式开始咯!
- 发起一波模块引入
1 | const cheerio = require("cheerio"); |
2 | const axios = require("axios").default; |
3 | const mongoose = require("mongoose"); |
4 | const Entities = require("html-entities").XmlEntities; |
5 | const entities = new Entities(); |
6 | const { Product, Detail, Category } = require("./models"); |
- 连接数据库
// Bootstrap: connect to the local MongoDB instance, empty the Product, Detail
// and Category collections so every run starts from a clean slate, then kick
// off the scrape with loadData().
mongoose
  .connect("mongodb://localhost:27017/MangoStore-app", {
    useUnifiedTopology: true,
    useNewUrlParser: true
  })
  .then(() =>
    // deleteMany({}) replaces the deprecated Model.remove({}); the three
    // collections are independent, so they are cleared in parallel.
    Promise.all([
      Product.deleteMany({}),
      Detail.deleteMany({}),
      Category.deleteMany({})
    ])
  )
  .then(() => loadData())
  .catch((err) => {
    // Without this handler a failed connection or delete would surface only
    // as an unhandled promise rejection.
    console.error("Failed to prepare the database for scraping:", err);
  });
- 定义一个“安全”的请求方法准备发起进攻
/**
 * Request a page with a randomized client fingerprint after a short random
 * delay.
 *
 * Each call picks one user-agent string at random and sends freshly generated
 * dotted-quad values in the X-FORWARDED-FOR and CLIENT-IP headers.
 *
 * @param {string} url - Address to request with axios.get.
 * @returns {Promise<object>} Resolves with the axios response. Rejects with
 *   the axios error when the request fails, so callers can handle it instead
 *   of waiting on a promise that never settles.
 */
function fetchData(url) {
  const userAgents = [
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0",
    // Closing parenthesis restored; the original string was truncated.
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
    "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)"
  ];

  // Returns a uniformly random entry of a non-empty list.
  const pickRandom = (list) => list[Math.floor(Math.random() * list.length)];

  // Builds "a.b.c.d" with four independent octets. The factor is 256 so that
  // 255 is reachable (Math.random() is always strictly below 1).
  const randomIpv4 = () =>
    Array.from({ length: 4 }, () => Math.floor(Math.random() * 256)).join(".");

  // Product of two uniform draws: always below 100 ms.
  const delayMs = Math.random() * 100 * Math.random();

  return new Promise((resolve) => {
    setTimeout(resolve, delayMs);
  }).then(() =>
    // Returning the axios promise propagates both fulfilment and rejection.
    axios.get(url, {
      timeout: 500000,
      headers: {
        "user-agent": pickRandom(userAgents),
        "X-FORWARDED-FOR": randomIpv4(),
        "CLIENT-IP": randomIpv4()
      }
    })
  );
}
我的对象是一个服装商场,在此俺先对着分类下手了
/**
 * Seed the Category collection with the five hard-coded store sections and
 * start scraping the product list of each one.
 *
 * @returns {Promise<void|Array>} Settles once the categories are stored and
 *   loadProduct has been invoked for every stored category. Failures are
 *   logged with console.error rather than left as unhandled rejections.
 */
function loadData() {
  const types = [
    { name: "女鞋", url: "https://www.yougou.com/f-0-MXZ-0-1.html" },
    { name: "男鞋", url: "https://www.yougou.com/f-0-Y0A-04Y004-1.html" },
    { name: "运动", url: "https://www.yougou.com/f-0-PTK-0-1.html" },
    { name: "箱包", url: "https://www.yougou.com/f-0-6LJ-0-1.html" },
    { name: "儿童", url: "https://www.yougou.com/f-0-9XB-0-1.html" }
  ];

  return Category.insertMany(types)
    .then((categories) =>
      // Pass only the document: map's index argument must not leak into
      // loadProduct's second parameter.
      Promise.all(categories.map((category) => loadProduct(category)))
    )
    .catch((err) => {
      console.error("Failed to seed categories or start product scraping:", err);
    });
}
好嘞,分类表数据有了,下面来商品的数据:
/**
 * Scrape one category listing page, store every listed product and start
 * scraping each stored product's detail page.
 *
 * @param {object} category - Stored category document.
 * @param {string} category.url - Listing page to fetch.
 * @param {string} category.name - Category name copied onto each product.
 * @param {*} category._id - Category id copied onto each product as `id`.
 * @param {boolean} [isFirst=true] - Kept for backward compatibility only; the
 *   two branches it used to select were identical, so it has no effect.
 * @returns {Promise<void|Array>} Settles once the products are stored and
 *   loadDt has been invoked for each of them. Failures are logged with
 *   console.error rather than left as unhandled rejections.
 */
function loadProduct({ url, name, _id }, isFirst = true) {
  return fetchData(url)
    .then((res) => {
      const $ = cheerio.load(res.data.toString());
      const products = [];

      $(".proList li").each(function () {
        const $item = $(this);
        products.push({
          title: $item.find(".srchlst-wrap .bd .nptt a").text(),
          imgUrl: $item.find(".srchlst-wrap .goods-desc .collect").attr("src"),
          // The current price is read from the `price` attribute of the
          // third matching <em>.
          CurrentPrice: $item
            .find(".srchlst-wrap .bd .price_sc em")
            .eq(2)
            .attr("price"),
          OriginPrice: $item.find(".srchlst-wrap .bd .origin-price i").text(),
          detailUrl: $item.find(".srchlst-wrap .goods-head a").attr("href"),
          category: name,
          id: _id
        });
      });

      return Product.insertMany(products);
    })
    .then((saved) => Promise.all(saved.map((product) => loadDt(product))))
    .catch((err) => {
      console.error(`Failed to load products for category "${name}":`, err);
    });
}
紧接着,来一些商品详情吧~
/**
 * Scrape one product detail page and store the extracted record(s) in the
 * Detail collection.
 *
 * @param {object} product - Stored product document.
 * @param {string} product.detailUrl - Detail page to fetch.
 * @param {*} product._id - Product id copied onto the detail record as `id`.
 * @param {string} product.category - Category name copied onto the detail
 *   record as `name`.
 * @returns {Promise<void>} Settles once the details are stored. Failures are
 *   logged with console.error rather than left as unhandled rejections.
 */
function loadDt({ detailUrl, _id, category }) {
  // A list item without a link yields no detailUrl; there is nothing to fetch.
  if (!detailUrl) {
    console.warn("Skipping product without a detail URL:", _id);
    return Promise.resolve();
  }

  return fetchData(detailUrl)
    .then((res) => {
      const $ = cheerio.load(res.data.toString());
      const proDetails = [];

      $("#goodsContainer").each(function () {
        const $container = $(this);
        const mainImage = $container.find("#goodsImg0 .goodsPic img").attr("src");
        // Thumbnails are addressed by the class suffixes 2 through 6.
        const thumbnails = [2, 3, 4, 5, 6].map((suffix) =>
          $container
            .find(`#spec-list .list-h li .picSmallClass${suffix}`)
            .attr("src")
        );
        const $sizes = $container.find(".size .prosize .prodSpec a");

        const proDetail = {
          imgUrl: mainImage,
          imgUrlall: [mainImage, ...thumbnails],
          title: $container.find(".shopping-container h1").text(),
          OriginPrice: $container
            .find(".shopping-container .good_ygprcarea #ygprice_area del")
            .text(),
          // Only the first three size options are recorded.
          sizeAll: [0, 1, 2].map((index) => $sizes.eq(index).attr("data-name"))
        };

        console.log(proDetail);
        proDetail.id = _id;
        proDetail.name = category;
        proDetails.push(proDetail);
      });

      return Detail.insertMany(proDetails);
    })
    .then(() => console.log("保存详情成功"))
    .catch((err) => {
      console.error(`Failed to load product detail from ${detailUrl}:`, err);
    });
}
OK , 够咱玩数据库和接口了,收手走起接口!~
本博客所有文章除特别声明外,均采用 CC BY-NC-SA 4.0 许可协议。转载请注明来自 仙女璇!