动态网页爬取样例（WebCollector+selenium+phantomjs）

目标：动态网页爬取

说明：这里的动态网页指几种可能：1）需要用户交互，如常见的登录操作；2）网页通过JS / AJAX动态生成，如一个html里有<div id="test"></div>，通过JS生成<div id="test"><span>aaa</span></div>。

这里用WebCollector 2做爬虫，这东西用起来也方便，不过要支持动态网页，关键还是要靠另外一个API -- selenium 2（集成htmlunit 和 phantomjs）。

1）需要登录后的爬取，如新浪微博

import java.util.Set;

import cn.edu.hfut.dmic.webcollector.crawler.DeepCrawler;

import cn.edu.hfut.dmic.webcollector.model.Links;

import cn.edu.hfut.dmic.webcollector.model.Page;

import cn.edu.hfut.dmic.webcollector.net.HttpRequesterImpl;

import org.openqa.selenium.Cookie;

import org.openqa.selenium.WebElement;

import org.openqa.selenium.htmlunit.HtmlUnitDriver;

import org.jsoup.nodes.Element;

import org.jsoup.select.Elements;

/*

 * 登录后爬取

 * Refer: http://nutcher.org/topics/33

 * https://github.com/CrawlScript/WebCollector/blob/master/README.zh-cn.md

 * Lib required: webcollector-2.07-bin, selenium-java-2.44.0 & its lib

 */

/**
 * Crawler that fetches weibo.cn pages as a logged-in user.
 *
 * <p>The constructor logs in through {@link WeiboCN#getSinaCookie(String, String)} and installs
 * the resulting cookie on the crawler's HTTP requester, so every page request carries the
 * session.
 */
public class WebCollector1 extends DeepCrawler {

    /**
     * Creates the crawler and attempts to attach a weibo.cn login cookie to its requester.
     *
     * <p>The account and password are sent in plain text, so use a throwaway account.
     * A login failure is only reported on stderr; the crawler is still constructed, without
     * a cookie.
     *
     * @param crawlPath path handed to the {@code DeepCrawler} constructor
     */
    public WebCollector1(String crawlPath) {
        super(crawlPath);
        try {
            String cookie = WebCollector1.WeiboCN.getSinaCookie("yourAccount", "yourPwd");
            HttpRequesterImpl myRequester = (HttpRequesterImpl) this.getHttpRequester();
            myRequester.setCookie(cookie);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Prints the text of every {@code div.c} element of the fetched page.
     *
     * @param page the fetched page
     * @return always {@code null}; to also crawl comments, extract the comment-page URLs
     *         here and return them instead
     */
    @Override
    public Links visitAndGetNextLinks(Page page) {
        // Each weibo post is selected with the "div.c" CSS selector.
        Elements weibos = page.getDoc().select("div.c");
        for (Element weibo : weibos) {
            System.out.println(weibo.text());
        }
        return null;
    }

    /** Seeds five list pages of one account and starts the crawl. */
    public static void main(String[] args) {
        WebCollector1 crawler = new WebCollector1("/home/hu/data/weibo");
        crawler.setThreads(3);

        // NOTE(review): the page index starts at 0 - confirm whether weibo.cn numbers
        // pages from 0 or from 1.
        for (int i = 0; i < 5; i++) {
            crawler.addSeed("http://weibo.cn/zhouhongyi?vt=4&page=" + i);
        }

        try {
            crawler.start(1);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /** Login helper for weibo.cn. */
    public static class WeiboCN {

        /**
         * Logs in to weibo.cn with a headless HtmlUnit browser and returns the session cookies.
         *
         * <p>This works for weibo.cn, not for weibo.com. weibo.cn transmits the credentials
         * in plain text, so use a throwaway account.
         *
         * @param username Sina Weibo user name
         * @param password Sina Weibo password
         * @return the cookies as {@code name=value;} pairs concatenated into one string
         * @throws Exception if the resulting cookies do not contain {@code gsid_CTandWM},
         *         which is treated as a failed login, or if driving the login page fails
         */
        public static String getSinaCookie(String username, String password) throws Exception {
            HtmlUnitDriver driver = new HtmlUnitDriver();
            Set<Cookie> cookieSet;
            try {
                driver.setJavascriptEnabled(true);
                driver.get("http://login.weibo.cn/login/");

                WebElement mobile = driver.findElementByCssSelector("input[name=mobile]");
                mobile.sendKeys(username);
                // The password field is matched by name prefix (^=), not by exact name.
                WebElement pass = driver.findElementByCssSelector("input[name^=password]");
                pass.sendKeys(password);
                WebElement rem = driver.findElementByCssSelector("input[name=remember]");
                rem.click();
                WebElement submit = driver.findElementByCssSelector("input[name=submit]");
                submit.click();

                cookieSet = driver.manage().getCookies();
            } finally {
                // Release the browser even when a lookup or navigation above throws.
                driver.quit();
            }

            StringBuilder sb = new StringBuilder();
            for (Cookie cookie : cookieSet) {
                sb.append(cookie.getName()).append('=').append(cookie.getValue()).append(';');
            }

            String result = sb.toString();
            if (!result.contains("gsid_CTandWM")) {
                throw new Exception("weibo login failed");
            }
            return result;
        }
    }
}

* 这里有个自定义路径/home/hu/data/weibo（WebCollector1 crawler=new WebCollector1("/home/hu/data/weibo");），是用来存放嵌入式数据库Berkeley DB的数据的。

* 整体上来自Webcollector 作者的sample。

2）JS动态生成HTML元素的爬取

import java.util.List;

import org.openqa.selenium.By;

import org.openqa.selenium.WebDriver;

import org.openqa.selenium.WebElement;

import cn.edu.hfut.dmic.webcollector.crawler.DeepCrawler;

import cn.edu.hfut.dmic.webcollector.model.Links;

import cn.edu.hfut.dmic.webcollector.model.Page;

/*

 * JS爬取

 * Refer: http://blog.csdn.net/smilings/article/details/7395509

 */

/**
 * Crawler that reads content generated by JavaScript, by rendering each page in a
 * WebDriver-controlled browser instead of parsing the raw HTTP response.
 */
public class WebCollector3 extends DeepCrawler {

    /**
     * @param crawlPath path handed to the {@code DeepCrawler} constructor
     */
    public WebCollector3(String crawlPath) {
        super(crawlPath);
    }

    /**
     * Renders the page through {@code PageUtils.getWebDriver} and prints the text of every
     * element matching {@code #feed_content span}.
     *
     * @param page the page whose URL is rendered
     * @return always {@code null}; no follow-up links are extracted
     */
    @Override
    public Links visitAndGetNextLinks(Page page) {
        WebDriver driver = PageUtils.getWebDriver(page);
        try {
            List<WebElement> spans = driver.findElements(By.cssSelector("#feed_content span"));
            for (WebElement span : spans) {
                System.out.println("Text是：" + span.getText());
            }
        } finally {
            // A new driver is opened for every visited page; quit it so browser
            // processes do not pile up during the crawl.
            driver.quit();
        }
        return null;
    }

    /** Seeds a single detail page and starts the crawl. */
    public static void main(String[] args) {
        WebCollector3 crawler = new WebCollector3("/home/hu/data/wb");

        // The seed URL does not vary by page number, so it is added exactly once.
        crawler.addSeed("http://cq.qq.com/baoliao/detail.htm?294064");

        try {
            crawler.start(1);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

PageUtils.java

import java.io.BufferedReader;

import java.io.IOException;

import java.io.InputStream;

import java.io.InputStreamReader;

import org.openqa.selenium.JavascriptExecutor;

import org.openqa.selenium.WebDriver;

import org.openqa.selenium.chrome.ChromeDriver;

import org.openqa.selenium.htmlunit.HtmlUnitDriver;

import org.openqa.selenium.ie.InternetExplorerDriver;

import org.openqa.selenium.phantomjs.PhantomJSDriver;

import com.gargoylesoftware.htmlunit.BrowserVersion;

import cn.edu.hfut.dmic.webcollector.model.Page;

/**
 * Helpers that load a crawled {@code Page}'s URL in a JavaScript-capable browser
 * (HtmlUnit or PhantomJS) so that script-generated content can be read.
 *
 * <p>Every driver returned from this class is owned by the caller, who must call
 * {@code quit()} on it when done.
 */
public class PageUtils {

    /** System property naming the PhantomJS executable. */
    private static final String PHANTOMJS_PATH_PROPERTY = "phantomjs.binary.path";

    /** PhantomJS executable used when the system property above is not set. */
    private static final String DEFAULT_PHANTOMJS_PATH =
            "D:\\Installs\\Develop\\crawling\\phantomjs-2.0.0-windows\\bin\\phantomjs.exe";

    /** PhantomJS script run by {@link #getPhantomJSDriver(Page)}. */
    private static final String PARSER_SCRIPT_PATH =
            "D:\\workspace\\crawlTest1\\src\\crawlTest1\\parser.js";

    /**
     * Opens the page in an HtmlUnit driver with JavaScript enabled.
     *
     * @param page the page whose URL is loaded
     * @return the driver, positioned on the page
     */
    public static HtmlUnitDriver getDriver(Page page) {
        HtmlUnitDriver driver = new HtmlUnitDriver();
        driver.setJavascriptEnabled(true);
        return open(driver, page);
    }

    /**
     * Opens the page in an HtmlUnit driver that emulates the given browser version, with
     * JavaScript enabled.
     *
     * @param page the page whose URL is loaded
     * @param browserVersion the browser version HtmlUnit should emulate
     * @return the driver, positioned on the page
     */
    public static HtmlUnitDriver getDriver(Page page, BrowserVersion browserVersion) {
        HtmlUnitDriver driver = new HtmlUnitDriver(browserVersion);
        driver.setJavascriptEnabled(true);
        return open(driver, page);
    }

    /**
     * Opens the page in a PhantomJS driver.
     *
     * <p>The executable is taken from the {@code phantomjs.binary.path} system property;
     * when that property is unset it is set to the built-in default path first.
     *
     * @param page the page whose URL is loaded
     * @return the driver, positioned on the page
     */
    public static WebDriver getWebDriver(Page page) {
        System.setProperty(PHANTOMJS_PATH_PROPERTY, phantomJsPath());
        return open(new PhantomJSDriver(), page);
    }

    /**
     * Runs PhantomJS directly (without Selenium) on the parser script and returns whatever
     * the script writes to standard output.
     *
     * <p>The output lines are concatenated without line terminators.
     *
     * @param page the page whose trimmed URL is passed to the script as its argument
     * @return the script's standard output decoded as UTF-8, or {@code null} if starting
     *         the process or reading its output fails
     */
    public static String getPhantomJSDriver(Page page) {
        // An explicit argument list keeps the URL as one argument even if it contains
        // whitespace, which a single concatenated command string would split apart.
        ProcessBuilder builder = new ProcessBuilder(
                phantomJsPath(), PARSER_SCRIPT_PATH, page.getUrl().trim());
        Process process = null;
        BufferedReader reader = null;
        try {
            process = builder.start();
            reader = new BufferedReader(new InputStreamReader(process.getInputStream(), "UTF-8"));
            StringBuilder content = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                content.append(line);
            }
            return content.toString();
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing the pipe fails.
                }
            }
            if (process != null) {
                process.destroy();
            }
        }
    }

    /** Returns the configured PhantomJS executable path, or the built-in default. */
    private static String phantomJsPath() {
        return System.getProperty(PHANTOMJS_PATH_PROPERTY, DEFAULT_PHANTOMJS_PATH);
    }

    /** Loads the page URL in the driver; quits the driver if loading throws. */
    private static <T extends WebDriver> T open(T driver, Page page) {
        try {
            driver.get(page.getUrl());
            return driver;
        } catch (RuntimeException e) {
            // The caller never receives this driver, so it must be released here.
            driver.quit();
            throw e;
        }
    }
}

2.1）HtmlUnitDriver getDriver是selenium 1.x的做法，已经过时了，现在用WebDriver getWebDriver。

2.2）这里用了几种方法：HtmlUnitDriver, ChromeDriver, PhantomJSDriver, PhantomJS，参考 http://blog.csdn.net/five3/article/details/19085303 。各自的优缺点如下：

driver类型	优点	缺点	应用
真实浏览器driver	真实模拟用户行为	效率、稳定性低	兼容性测试
HtmlUnit	速度快	js引擎不是主流的浏览器支持的	包含少量js的页面测试
PhantomJS	速度中等、模拟行为接近真实	不能模拟不同/特定浏览器的行为	非GUI的功能性测试

* 真实浏览器driver 包含 Firefox, Chrome, IE

2.3）用PhantomJSDriver的时候，遇上错误：ClassNotFoundException: org.openqa.selenium.browserlaunchers.Proxies，原因居然是selenium 2.44 的bug。后来通过maven找到phantomjsdriver-1.2.1.jar 才解决了。

2.4）另外，我还试了PhantomJS 原生调用（也就是不用selenium，直接调用PhantomJS，见上面的getPhantomJSDriver方法）。原生调用需要一个JS脚本，这里的parser.js代码如下：

// PhantomJS script: loads the URL given on the command line and writes the
// rendered page HTML to standard output, where the calling Java process
// reads it from the child process's InputStream.

var system = require('system');

// args[0] is the script itself; args[1] is the first user-supplied argument (the URL).
var address = system.args[1];

var page = require('webpage').create();

page.open(address, function (status) {
    if (status !== 'success') {
        console.log('Unable to post!');
        // A non-zero exit code lets the caller tell a failed load from a successful one.
        phantom.exit(1);
        return;
    }

    // Printing the content is how the result is handed to the caller as a stream.
    console.log(page.content);
    phantom.exit();
});

3）后话

3.1）HtmlUnitDriver + PhantomJSDriver是当前最可靠的动态抓取方案。

3.2）这过程中用到很多包、exe，遇到很多的墙~，有需要的朋友可以找我要。

Reference

http://www.ibm.com/developerworks/cn/web/1309_fengyq_seleniumvswebdriver/

http://blog.csdn.net/smilings/article/details/7395509

http://phantomjs.org/download.html

http://blog.csdn.net/five3/article/details/19085303

http://phantomjs.org/quick-start.html

... ...

动态网页爬取样例（WebCollector+selenium+phantomjs）的更多相关文章

动态网页爬取例子（WebCollector+selenium+phantomjs）
目标:动态网页爬取说明:这里的动态网页指几种可能:1)需要用户交互,如常见的登录操作:2)网页通过JS / AJAX动态生成,如一个html里有<div id="test" ...
Node.js 动态网页爬取 PhantomJS 使用入门(转)
Node.js 动态网页爬取 PhantomJS 使用入门原创NeverSettle101 发布于2017-03-24 09:34:45 阅读数 8309 收藏展开版权声明:本文为 winte ...
python动态网页爬取——四六级成绩批量爬取
需求: 四六级成绩查询网站我所知道的有两个:学信网(http://www.chsi.com.cn/cet/)和99宿舍(http://cet.99sushe.com/),这两个网站采用的都是动态网页. ...
为采集动态网页安装和测试Python Selenium库
1. 引言上一篇<为编写网络爬虫程序安装Python3.5>中测试小例子对静态网页做了一个简单的采集程序,而动态网页因为需要动态加载js获取数据,所以使用urllib直接openurl已经 ...
爬虫入门（三）——动态网页爬取：爬取pexel上的图片
Pexel上有大量精美的图片,没事总想看看有什么好看的自己保存到电脑里可能会很有用但是一个一个保存当然太麻烦了所以不如我们写个爬虫吧(๑•̀ㅂ•́)و✧ 一开始学习爬虫的时候希望爬取pexel上的 ...
python 爬虫proxy,BeautifulSoup+requests+mysql 爬取样例
实现思路: 由于反扒机制,所以需要做代理切换,去爬取,内容通过BeautifulSoup去解析,最后入mysql库 1.在西刺免费代理网获取代理ip,并自我检测是否可用 2.根据获取的可用代理ip去发 ...
python+selenium+PhantomJS爬取网页动态加载内容
一般我们使用python的第三方库requests及框架scrapy来爬取网上的资源,但是设计javascript渲染的页面却不能抓取,此时,我们使用web自动化测试化工具Selenium+无界面浏览 ...
Python开发爬虫之动态网页抓取篇：爬取博客评论数据——通过Selenium模拟浏览器抓取
区别于上篇动态网页抓取,这里介绍另一种方法,即使用浏览器渲染引擎.直接用浏览器在显示网页时解析 HTML.应用 CSS 样式并执行 JavaScript 的语句. 这个方法在爬虫过程中会打开一个浏览器 ...
基于selenium+phantomJS的动态网站全站爬取
由于需要在公司的内网进行神经网络建模试验(https://www.cnblogs.com/NosenLiu/articles/9463886.html),为了更方便的在内网环境下快速的查阅资料,构建深 ...

随机推荐

（WC2016模拟十一）【BZOJ4695】最假女选手
ps:好久没更博啦……这几天连着有模拟赛,等初赛前后休息的时候来疯狂补坑吧……顺便补一下前面的数论啥的? 题解: mdzz我场上写了个15分暴力长度跟标算差不多... 线段树大法好啊!这题听说很多人做 ...
java 对象拆箱装箱编译和反编译的验证
创建对象 package 创建对象的个数; public class main { public static void main(String[] agrs){ Check c1=new Check ...
CAD二次开发（02）-添加对象到模型空间
using System; using System.Collections.Generic; using System.Linq; using System.Text; using System.T ...
Json学习总结（2）——Java 下的 JSON库性能比较：JSON.simple vs. GSON vs. Jackson vs. JSONP
JSON已经成为当前服务器与WEB应用之间数据传输的公认标准,不过正如许多我们所习以为常的事情一样,你会觉得这是理所当然的便不再深入思考了.我们很少会去想用到的这些JSON库到底有什么不同,但事实上它 ...
洛谷 P1572 计算分数
P1572 计算分数题目描述 Csh被老妈关在家里做分数计算题,但显然他不愿意坐这么多复杂的计算.况且在家门口还有Xxq在等着他去一起看电影.为了尽快地能去陪Xxq看电影,他把剩下的计算题交给了你, ...
Android新手入门2016（14）--FragmentTabHost实现选项卡和菜单
本文来自肥宝传说之路,引用必须注明出处! 这章憋了好久.本来想写选项卡的,学到TabHost,TabWidget的,把代码拿过来准备研究的时候,发现竟然在4.0.3版本号被废弃了. 百度一下,发如今后 ...
zoj 3820 Building Fire Stations （二分+树的直径）
Building Fire Stations Time Limit: 5 Seconds Memory Limit: 131072 KB Special Judge Marjar ...
rest_framework 解析器（下全局配置使用）
解析器一般都是全局设置参考文档 www.cnblogs.com/wupeiqi/articles/.html REST_FRAMEWORK=( "DEFAULT_PARSER_CLASS ...
修改host方法
打开路径 C:\Windows\System32\drivers\etc 将hosts文件拷贝出来修改之后放回去覆盖即可以下是一个例子,想得到ip可以先ping一下那个域名. 左边是ip,右边是域名 ...
Bayes++ Library入门学习之熟悉class-Importance_resampler
接下来,需要介绍的是重要性重采样类Bayesian_filter::Improtance_resampler.该类实现了两种重采样方法[1][2],和其子类的继承关系图如下: 其中Standard_r ...

动态网页爬取样例（WebCollector+selenium+phantomjs）

动态网页爬取样例（WebCollector+selenium+phantomjs）的更多相关文章

随机推荐

热门专题