nodejs模块 – phantomjs的简单爬虫应用实例

PhantomJS是一个可以用JavaScript编写脚本的无头web浏览器。它可以在Windows、macOS、Linux和FreeBSD上运行。
它使用QtWebKit作为后端，为各种web标准(DOM处理、CSS选择器、JSON、Canvas和SVG)提供快速的原生支持。

安装phantom（在Node.js中调用PhantomJS的封装模块）

npm i phantom --save-dev

npm i phantom --save-dev

安装Cheerio。Cheerio是一个在服务器端实现了jQuery核心功能的库。

npm i cheerio --save-dev

npm i cheerio --save-dev

实例一（这是官方首页给的一个实例）：

下面这个用于PhantomJS的简单脚本加载百度主页，稍等片刻，然后将其捕获到一个图像中。

// PhantomJS script: load the Baidu home page, wait 200 ms, then save a
// screenshot to google.png and quit.
// NOTE(review): `var` is kept because this runs inside PhantomJS rather than
// Node — confirm the target PhantomJS version before switching to const/let.
var page = require('webpage').create();
page.open('http://www.baidu.com', function (status) {
    // The open callback receives the load status; do not capture a page that
    // failed to load, and signal the failure through the exit code.
    if (status !== 'success') {
        console.log('Unable to load http://www.baidu.com (status: ' + status + ')');
        phantom.exit(1);
        return;
    }
    // Short delay before capturing, as in the original example.
    setTimeout(function () {
        page.render('google.png');
        phantom.exit();
    }, 200);
});

// PhantomJS script: open the Baidu home page, pause 200 ms, write a
// screenshot to google.png, then terminate PhantomJS.
var page = require('webpage').create();
var url = 'http://www.baidu.com';

page.open(url, function (status) {
    // Bail out with a non-zero exit code when the page could not be loaded,
    // instead of rendering whatever happens to be in the page.
    if (status !== 'success') {
        console.log('Unable to load ' + url + ' (status: ' + status + ')');
        phantom.exit(1);
        return;
    }

    setTimeout(function () {
        page.render('google.png');
        phantom.exit();
    }, 200);
});

实例二：

使用WebStorm创建一个NodeJS-Express项目，项目创建好以后，修改项目的index.js文件。

目的：获取百度首页上所有能匹配 http[s]?://.* 的a标签，因此有：

const express = require('express');
const router = express.Router();
const phantom = require('phantom');
const cheerio = require('cheerio');

/* GET home page. */
router.get('/', function (req, res, next) {
    res.header('Content-Type', 'application/json');
    let sitepage = null; //创建网页对象实例
    let phInstance = null; //创建phantomj实例对象
    phantom.create()
        .then(instance => {
            phInstance = instance;
            return instance.createPage();
        })
        .then(page => {
            sitepage = page;
            return page.open('https://www.baidu.com/');
        })
        .then(status => {
            console.info(status); //获取结果状态
            return sitepage.property('content'); //获取相应的属性内容
        })
        .then(content => { 
            const $ = cheerio.load(content);  //解析输出的结果内容
            const jsonResult = [];
            $('a[href]').each((i, item) => {  //抓取符合条件的a标签的链接地址
                const href = $(item).attr('href');
                if (new RegExp(/http[s]?:\/\/.*/).test(href)) {
                    jsonResult.push(href);
                }
            });
            sitepage.close();
            phInstance.exit();
            res.json(jsonResult);
        })
        .catch(error => {
            console.log(error);
            phInstance.exit();
            res.json({status: false});
        });
});

module.exports = router;

const express = require('express');

const router = express.Router();

const phantom = require('phantom');

const cheerio = require('cheerio');

/* GET home page. */

router.get('/', function (req, res, next) {

res.header('Content-Type', 'application/json');

let sitepage = null; //创建网页对象实例

let phInstance = null; //创建phantomj实例对象

phantom.create()

.then(instance => {

phInstance = instance;

return instance.createPage();

})

.then(page => {

sitepage = page;

return page.open('https://www.baidu.com/');

})

.then(status => {

console.info(status); //获取结果状态

return sitepage.property('content'); //获取相应的属性内容

})

.then(content => {

const $ = cheerio.load(content); //解析输出的结果内容

const jsonResult = [];

$('a[href]').each((i, item) => { //抓取符合条件的a标签的链接地址

const href = $(item).attr('href');

if (new RegExp(/http[s]?:\/\/.*/).test(href)) {

jsonResult.push(href);

}

});

sitepage.close();

phInstance.exit();

res.json(jsonResult);

})

.catch(error => {

console.log(error);

phInstance.exit();

res.json({status: false});

});

module.exports = router;

运行以上代码后，页面中输出链接集合。

——————————

PhantomJS官网

1、页面自动化

使用标准的DOM API或jQuery等常用库访问web页面并提取信息，玩爬虫的，就好好研究这个了。

2、屏幕截图

以编程方式捕获web内容，包括SVG和Canvas，创建网站截图与缩略图预览。

3、无头网站测试

使用诸如Jasmine、QUnit、Mocha、WebDriver等框架运行功能测试。

4、网络监控

监控页面加载和导出为标准的HAR文件。使用YSlow和Jenkins自动化性能分析。

织梦先生

一个从零开始并且为了自己心底的理想不断奋斗中的人

nodejs模块 – phantomjs的简单爬虫应用实例

实例一（这是官方首页给的一个实例）：

实例二：