Java WebClient 总结
/**
 * Builds a {@code WebClient} that emulates Firefox 24 and is preconfigured for crawling.
 * <p>
 * The client uses a 20000 ms timeout, does not throw on failing HTTP status
 * codes or script errors, and has CSS and JavaScript processing switched off.
 * A fixed set of browser-like request headers is attached to the client.
 *
 * @return a newly created, configured client
 */
private WebClient getAWebClient() {
    WebClient webClient = new WebClient(BrowserVersion.FIREFOX_24);
    webClient.getOptions().setTimeout(20000);
    // webClient.getCookieManager().setCookiesEnabled(true);
    webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
    webClient.getOptions().setThrowExceptionOnScriptError(false);
    webClient.getOptions().setCssEnabled(false);
    webClient.getOptions().setJavaScriptEnabled(false);
    // The media type must be "text/html"; the previous value "textml" is not a
    // valid media type, so the header never actually asked for HTML.
    webClient.addRequestHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
    webClient.addRequestHeader("Accept-Encoding", "gzip, deflate");
    webClient.addRequestHeader("Accept-Language", "en-US,en;q=0.5");
    webClient.addRequestHeader("Cache-Control", "max-age=0");
    webClient.addRequestHeader("Connection", "keep-alive");
    // NOTE(review): the Host header is pinned to www.amazon.com; presumably this
    // client is only ever pointed at that host - confirm against callers.
    webClient.addRequestHeader("Host", "www.amazon.com");
    webClient.addRequestHeader("User-Agent", "Mozilla/5.0 (X11; Linux x86_64; rv:24.0) Gecko/20100101 Firefox/24.0");
    return webClient;
}
/**
 * Crawls a web page and returns its content serialized as XML.
 * <p>
 * The client's cookies are cleared first; if the file named by
 * {@code cookiePathAppendRandom()} exists, the cookie store serialized in it
 * is loaded into the client. The page is then fetched through
 * {@code MyGetPage}.
 *
 * @param url address of the page to crawl
 * @return the page XML, or a builder containing the sentinel text
 *         {@code "getNullPage"} when no page was obtained or the XML is at
 *         most 300 characters long; in both of those cases a crawl model for
 *         the URL is passed to {@code exceptionFun}
 */
public StringBuilder crawlPage(String url) {
    final String threadName = Thread.currentThread().getName();
    logger.info(threadName + " crawl " + url);
    // The custom page-fetch logic (MyGetPage) is driven from here.
    webClient.getCookieManager().clearCookies();
    logger.info(threadName + " webClient.getCookieManager().clearCookies();");
    File file = new File(cookiePathAppendRandom());
    logger.info(threadName + " File file = new File(cookiePathAppendRandom());");
    if (file.exists()) {
        restoreCookies(file);
    }
    logger.info(threadName + " MyGetPage start,url:" + url);
    HtmlPage page = MyGetPage(new StringBuffer(url));
    logger.info(threadName + " MyGetPage end,url:" + url);
    if (page == null) {
        logger.info("Page null!");
        return reportFailedCrawl(url);
    }
    StringBuilder builder = new StringBuilder();
    logger.info(threadName + " builder.append(page.asXml());");
    builder.append(page.asXml());
    logger.info(threadName + " return builder;");
    logger.info(threadName + " CrawlPage $Length=" + builder.length());
    // Very short documents are treated the same way as a missing page.
    if (builder.length() <= 300) {
        return reportFailedCrawl(url);
    }
    return builder;
}

/**
 * Loads the cookie store serialized in {@code cookieFile} into the web client.
 * A file that cannot be opened or deserialized is logged and skipped, leaving
 * the client without restored cookies instead of failing the crawl.
 */
private void restoreCookies(File cookieFile) {
    // NOTE(review): this is Java-native deserialization; it is only safe if
    // the cookie file is always written by this application - confirm.
    CookieStore cookieStore = null;
    FileInputStream fin = null;
    try {
        fin = new FileInputStream(cookieFile);
        ObjectInputStream in = new ObjectInputStream(fin);
        cookieStore = (CookieStore) in.readObject();
    } catch (IOException e) {
        // Also covers FileNotFoundException (file removed after the exists() check).
        logger.error(e);
    } catch (ClassNotFoundException e) {
        logger.error(e);
    } finally {
        // Closing the underlying file stream releases the handle on every path.
        if (fin != null) {
            try {
                fin.close();
            } catch (IOException e) {
                logger.error(e);
            }
        }
    }
    if (cookieStore == null) {
        return;
    }
    List<org.apache.http.cookie.Cookie> storedCookies = cookieStore.getCookies();
    for (org.apache.http.cookie.Cookie stored : storedCookies) {
        Cookie cookie = new Cookie(stored.getDomain(), stored.getName(), stored.getValue(), stored.getPath(),
                stored.getExpiryDate(), false);
        webClient.getCookieManager().addCookie(cookie);
    }
}

/**
 * Passes a crawl model for {@code url} to {@code exceptionFun} and returns the
 * {@code "getNullPage"} sentinel.
 */
private StringBuilder reportFailedCrawl(String url) {
    // Models that failed during crawling can be collected in one list and sent
    // back to the server to be re-added to the crawl assignment queue.
    AmazonCrawlModel model = new AmazonCrawlModel(crawlId, crawlURLId, url, depth, ischange);
    exceptionFun(model);
    return new StringBuilder("getNullPage");
}
/**
 * Custom page fetch: loads {@code URL} and, while the response is a
 * "Robot Check" captcha page, keeps submitting recognized captcha text until
 * a different page comes back.
 * <p>
 * Failure handling:
 * <ul>
 * <li>{@code MalformedURLException}: gives up immediately, because retrying
 * the same URL cannot succeed.</li>
 * <li>{@code UnknownHostException}: sleeps for the number of minutes
 * configured as {@code unknowhost_sleeptime}, then retries; gives up once the
 * number of such failures reaches {@code unknowhost_maxtrytime}.</li>
 * <li>Any other exception (including failing HTTP status codes): waits one
 * second and retries; gives up on the fifth failure.</li>
 * <li>Thread interruption: the interrupt flag is restored and no further
 * attempt is made.</li>
 * </ul>
 * Every attempt is followed by a random pause of 1500-1999 ms.
 *
 * @param URL address of the page to fetch
 * @return the fetched page, or {@code null} when the fetch was abandoned
 */
private HtmlPage MyGetPage(StringBuffer URL) {
    final String threadName = Thread.currentThread().getName();
    HtmlPage page = null;
    int failedAttempts = 1;
    int unknownHostAttempts = 1;
    while (true) {
        boolean retry = false;
        try {
            logger.info(threadName + " webClient.getPage : " + URL + ",CrawlURL_id:"
                    + crawlURLId);
            page = webClient.getPage(URL.toString());
            page = solveRobotCheck(page, URL);
        } catch (MalformedURLException e) {
            // A malformed URL stays malformed; retrying would loop forever.
            logger.error(threadName + " " + e);
            return null;
        } catch (UnknownHostException e) {
            logger.error(threadName + " " + e);
            int sleepMinutes = Integer.parseInt(Configuration.getProperties("unknowhost_sleeptime"));
            logger.info("found UnknownHostException,start sleep " + sleepMinutes + " min");
            // long arithmetic so a large configured value cannot overflow int
            if (!pauseFor(60000L * sleepMinutes)) {
                return null;
            }
            logger.info("found UnknownHostException,end sleep " + sleepMinutes + " min");
            unknownHostAttempts++; // one more unknown-host failure
            logger.info(threadName + " " + dayformat1.format(System.currentTimeMillis())
                    + " [UnknowHostTryTimeCnt:" + unknownHostAttempts + "]");
            if (unknownHostAttempts > Integer.parseInt(Configuration.getProperties("unknowhost_maxtrytime"))) {
                return null;
            }
            retry = true;
        } catch (Exception e) {
            logger.error(threadName + " " + e);
            failedAttempts++; // one more failed access
            logger.info(threadName + " " + dayformat1.format(System.currentTimeMillis())
                    + " [TryTimeCnt:" + failedAttempts + "]");
            if (failedAttempts > 5) {
                return null;
            }
            if (!pauseFor(1000)) {
                return null;
            }
            retry = true;
        }
        // Pacing delay between requests; an interrupt ends the retry loop.
        if (!pauseFor(random.nextInt(500) + 1500)) {
            return retry ? null : page;
        }
        if (!retry) {
            return page;
        }
    }
}

/**
 * Submits recognized captcha text for as long as {@code page} is a
 * "Robot Check" page and returns the first page that is not one.
 * Declared with {@code throws Exception} because the checked exceptions of the
 * captcha recognizer are not visible here; the caller treats every failure as
 * a retryable fetch error.
 */
private HtmlPage solveRobotCheck(HtmlPage page, StringBuffer URL) throws Exception {
    final String threadName = Thread.currentThread().getName();
    final int maxClickAttempts = 5;
    Document doc = Jsoup.parse(page.asXml());
    int robotchecknum = 1;
    while (doc.select("title").text().equals("Robot Check")) {
        logger.info(threadName + " " + dayformat1.format(System.currentTimeMillis())
                + " [Robot Check,URL:" + URL + "]");
        String captchaText = AmazonGetCaptcha.GetCaptcha(new StringBuilder(doc.toString()));
        logger.info(threadName + " " + dayformat1.format(System.currentTimeMillis())
                + " end AmazonGetCaptcha.GetCaptcha");
        logger.info(dayformat1.format(new Date()) + " " + threadName + " : "
                + captchaText);

        logger.info(threadName + " page.getForms().get(0) Start");
        HtmlForm form = page.getForms().get(0);
        logger.info(threadName + " page.getForms().get(0) End");

        logger.info(threadName + " form.getElementsByTagName(button).get(0) Start");
        HtmlButton button = (HtmlButton) form.getElementsByTagName("button").get(0);
        logger.info(threadName + " form.getElementsByTagName(button).get(0) End");

        logger.info(threadName + " setValueAttribute Start");
        form.getInputByName("field-keywords").setValueAttribute(captchaText);
        logger.info(threadName + " setValueAttribute End");

        logger.info(threadName + " button.click Start");
        // Retry the click a bounded number of times; a persistent failure is
        // rethrown so the caller's counted retry logic takes over.
        int clickAttempts = 0;
        while (true) {
            try {
                page = button.click();
                break;
            } catch (Exception e1) {
                clickAttempts++;
                logger.error(threadName + " button.click出错了: " + e1);
                if (clickAttempts >= maxClickAttempts) {
                    throw e1;
                }
            }
        }
        logger.info(threadName + " button.click end");
        // NOTE(review): the result of refresh() is discarded, so this loop could
        // never terminate if asXml() really returned null - confirm whether the
        // refreshed page should be assigned back to `page`.
        while (page.asXml() == null) {
            logger.info(threadName + " page xml null");
            logger.info(threadName + " " + page.asXml());
            page.refresh();
            logger.info(threadName + " refresh End!");
        }
        logger.info(threadName + " button.click End");

        logger.info(threadName + " Start ParsePage!");
        doc = Jsoup.parse(page.asXml());
        if (!doc.select("title").text().equals("Robot Check")) {
            logger.info(threadName + " " + doc.select("title").text());
            logger.info(threadName + " "
                    + dayformat1.format(System.currentTimeMillis()) + " [Robot Check,captcha success:"
                    + captchaText + ",try num:" + robotchecknum + "]");
        }
        robotchecknum++;
    }
    return page;
}

/**
 * Sleeps for {@code millis} milliseconds. Returns {@code false}, after logging
 * and restoring the thread's interrupt flag, when the sleep was interrupted.
 */
private boolean pauseFor(long millis) {
    try {
        Thread.sleep(millis);
        return true;
    } catch (InterruptedException e) {
        logger.error(Thread.currentThread().getName() + " " + e);
        Thread.currentThread().interrupt();
        return false;
    }
}
Java WebClient 总结的更多相关文章
- Spark案例分析
一.需求:计算网页访问量前三名 import org.apache.spark.rdd.RDD import org.apache.spark.{SparkConf, SparkContext} /* ...
- C#调用JAVA接口WSSE方式用WebClient方式
C#读取JAVA的WSSE接口的调用代码: 用webclient 方式: /// <summary> /// 调用java cxf ws_security加密的服务wcf客户端对应的加密类 ...
- websocket通信 实现java模拟一个client与webclient通信
发文原由: 熟悉socket通信的同学,对于socket模拟server与client,实现相互通信, 或者使用websocket与java模拟的websocket服务器通信(比如一个聊天室),对于这 ...
- Java调用Http/Https接口(7,end)--WebClient调用Http/Https接口
WebClient是Spring提供的非阻塞.响应式的Http客户端,提供同步及异步的API,将会代替RestTemplate及AsyncRestTemplate.文中所使用到的软件版本:Java 1 ...
- java.lang.UnsupportedClassVersionError: com/gargoylesoftware/htmlunit/WebClient : Unsupported major.minor version 52.0 (unable to load class com.gargoylesoftware.htmlunit.WebClient)
java.lang.UnsupportedClassVersionError: com/gargoylesoftware/htmlunit/WebClient : Unsupported major. ...
- htmlunit学习之java.lang.NoSuchMethodError: com.gargoylesoftware.htmlunit.WebClient.getOptions()Lcom/gargoylesoftware/htmlunit/WebClientOptions;
运行到这里就报错 java.lang.NoSuchMethodError: com.gargoylesoftware.htmlunit.WebClient.getOptions()Lcom/gargo ...
- webClient请求JAVA超时解决方案
private class MyWebClient: WebClient { protected override WebRequest GetWebRequest(Uri uri) { WebReq ...
- C#、JAVA操作Hadoop(HDFS、Map/Reduce)真实过程概述。组件、源码下载。无法解决:Response status code does not indicate success: 500。
一.Hadoop环境配置概述 三台虚拟机,操作系统为:Ubuntu 16.04. Hadoop版本:2.7.2 NameNode:192.168.72.132 DataNode:192.168.72. ...
- Atitit.http httpclient实践java c# .net php attilax总结
Atitit.http httpclient实践java c# .net php attilax总结 1. Navtree>> net .http1 2. Httpclient理论1 2. ...
随机推荐
- 使用ASP.NET Web Api构建基于REST风格的服务实战系列教程【九】——API变了,客户端怎么办?
系列导航地址http://www.cnblogs.com/fzrain/p/3490137.html 前言 一旦我们将API发布之后,消费者就会开始使用并和其他的一些数据混在一起.然而,当新的需求出现 ...
- 斗鱼的sidebar的实现简陋的demo
斗鱼的sidebar的实现简陋的demo <!DOCTYPE html> <html> <head lang="en"> <meta ch ...
- authorization配置
在 Web.config 文件的<configuration>标记的子标记<authorization>和</authorization>之间用于设置应用程序的授权 ...
- Oracle nvl(),nvl2()函数介绍
NVL函数 Oracle/PLSQL中的一个函数. 格式为: NVL( string1, replace_with) 功能:如果string1为NULL,则NVL函数返回replace_with的值, ...
- Junit使用
eclipse Junit的简单使用: eclipse自带了Junit插件. 安装Junit,如下图所示,右键项目-->Properties-->Add Library 选择Junit 选 ...
- ajax实例详解
页面通过ajax和后台进行数据交互是非常简洁且方便的.特别是封装成json数据格式. 此处使用的是jQuery的ajax var params = { version:new Date().getTi ...
- Eclipse常用开发插件
以下是我整理的自己开发过程中的常用Eclipse插件,按字母排序: (1) AmaterasUML 介绍:Eclipse的UML插件,支持UML活动图,class图,sequen ...
- Hadoop伪分布式搭建(一)
下面内容主要说明在Windows虚拟机上面,怎么搭建一个Hadoop伪分布式,并如何运行wordcount程序和网页查看HDFS文件系统. 1 相关软件下载和安装 Apache官网提供hadoop版本 ...
- display : -webkit-box-inline 我见
发现: 最近在做移动端的东西,说起移动端弹性盒子布局真是无往不利,用起来特别爽,我也是偶尔间发现的这个属性并且它的用法,在网上基本查不到这个属性的资料(个人看法).如果没有听说过(display:bo ...
- C++你不知道的那些事儿—C++语言的15个晦涩特性
这个列表收集了 C++ 语言的一些晦涩(Obscure)特性,是我经年累月研究这门语言的各个方面收集起来的.C++非常庞大,我总是能学到一些新知识.即使你对C++已了如指掌,也希望你能从列表中学到一些 ...