/**
 * Builds a new HtmlUnit {@code WebClient} configured as Firefox 24.
 *
 * <p>The client is set up with a timeout of 20000 (milliseconds, per HtmlUnit's
 * {@code setTimeout}), no exceptions on failing HTTP status codes or script errors, and with
 * CSS and JavaScript processing switched off. A fixed set of Firefox-like request headers is
 * attached to every request.
 *
 * @return the newly created and configured client
 */
private WebClient getAWebClient() {
    WebClient webClient = new WebClient(BrowserVersion.FIREFOX_24);

    webClient.getOptions().setTimeout(20000);
    // webClient.getCookieManager().setCookiesEnabled(true);
    webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
    webClient.getOptions().setThrowExceptionOnScriptError(false);
    webClient.getOptions().setCssEnabled(false);
    webClient.getOptions().setJavaScriptEnabled(false);

    // The first media type must be "text/html"; the previous value "textml" was not a valid
    // media type.
    webClient.addRequestHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
    webClient.addRequestHeader("Accept-Encoding", "gzip, deflate");
    webClient.addRequestHeader("Accept-Language", "en-US,en;q=0.5");
    webClient.addRequestHeader("Cache-Control", "max-age=0");
    webClient.addRequestHeader("Connection", "keep-alive");
    // NOTE(review): the Host header is pinned to www.amazon.com for every request made with
    // this client - confirm the client is never used for other hosts.
    webClient.addRequestHeader("Host", "www.amazon.com");
    webClient.addRequestHeader("User-Agent", "Mozilla/5.0 (X11; Linux x86_64; rv:24.0) Gecko/20100101 Firefox/24.0");
    return webClient;
}
/**
 * Crawls a web page.
 *
 * <p>The cookie manager of {@code webClient} is cleared first; if the file named by
 * {@code cookiePathAppendRandom()} exists, the cookies serialized in it are loaded into the
 * client. The page is then fetched through {@code MyGetPage}.
 *
 * @param url address of the page to crawl
 * @return the page rendered as XML, or a builder containing the sentinel text
 *         {@code "getNullPage"} when no page could be fetched or the XML is 300 characters
 *         or shorter; in both failure cases a model describing the URL is passed to
 *         {@code exceptionFun}
 */
public StringBuilder crawlPage(String url) {
    final String threadName = Thread.currentThread().getName();
    StringBuilder builder = new StringBuilder();
    logger.info(threadName + " crawl " + url);

    // The MyGetPage-related code goes here.
    webClient.getCookieManager().clearCookies();
    logger.info(threadName + " webClient.getCookieManager().clearCookies();");
    File file = new File(cookiePathAppendRandom());
    logger.info(threadName + " File file = new File(cookiePathAppendRandom());");
    if (file.exists()) {
        loadCookiesFromFile(file);
    }

    logger.info(threadName + " MyGetPage start,url:" + url);
    HtmlPage page = MyGetPage(new StringBuffer(url));
    logger.info(threadName + " MyGetPage end,url:" + url);
    if (page == null) {
        // Models that failed during crawling can be gathered into a list and sent back to the
        // server so they are re-added to the crawl allocation queue.
        logger.info("Page null!");
        AmazonCrawlModel model = new AmazonCrawlModel(crawlId, crawlURLId, url, depth, ischange);
        exceptionFun(model);
        return new StringBuilder("getNullPage");
    }

    logger.info(threadName + " builder.append(page.asXml());");
    builder.append(page.asXml());
    logger.info(threadName + " return builder;");
    logger.info(threadName + " CrawlPage $Length=" + builder.length());
    // A result of 300 characters or fewer is treated the same way as a missing page.
    if (builder.length() <= 300) {
        AmazonCrawlModel model = new AmazonCrawlModel(crawlId, crawlURLId, url, depth, ischange);
        exceptionFun(model);
        return new StringBuilder("getNullPage");
    }
    return builder;
}

/**
 * Reads a serialized {@code CookieStore} from {@code cookieFile} and copies its cookies into
 * the cookie manager of {@code webClient}. Read failures are logged and leave the cookie
 * manager untouched instead of causing a NullPointerException.
 *
 * @param cookieFile file holding the serialized cookie store
 */
private void loadCookiesFromFile(File cookieFile) {
    CookieStore cookieStore = null;
    FileInputStream fin = null;
    ObjectInputStream in = null;
    try {
        fin = new FileInputStream(cookieFile);
        // NOTE(review): native Java deserialization - the cookie file must come from a
        // trusted location.
        in = new ObjectInputStream(fin);
        cookieStore = (CookieStore) in.readObject();
    } catch (IOException e) {
        logger.error(e);
    } catch (ClassNotFoundException e) {
        logger.error(e);
    } finally {
        // Closing the object stream also closes the underlying file stream; fall back to the
        // file stream when the object stream was never created.
        try {
            if (in != null) {
                in.close();
            } else if (fin != null) {
                fin.close();
            }
        } catch (IOException e) {
            logger.error(e);
        }
    }
    if (cookieStore == null) {
        return;
    }

    List<org.apache.http.cookie.Cookie> storedCookies = cookieStore.getCookies();
    for (org.apache.http.cookie.Cookie stored : storedCookies) {
        Cookie cookie = new Cookie(stored.getDomain(), stored.getName(), stored.getValue(), stored.getPath(),
                stored.getExpiryDate(), false);
        webClient.getCookieManager().addCookie(cookie);
    }
}
/**
 * Custom getPage: fetches {@code URL} with {@code webClient} and, while the returned page is
 * titled "Robot Check" (a captcha page), submits the recognised captcha until a different
 * page comes back.
 *
 * <p>Failure handling:
 * <ul>
 * <li>{@code MalformedURLException}: gives up at once, because retrying cannot repair the
 * URL.</li>
 * <li>{@code UnknownHostException}: sleeps for {@code unknowhost_sleeptime} minutes and
 * retries; gives up once the counter exceeds {@code unknowhost_maxtrytime}.</li>
 * <li>Any other exception (including a failing HTTP status): retries after one second and
 * gives up on the fifth failure.</li>
 * <li>Interruption while sleeping: the interrupt status is restored and the method stops
 * retrying.</li>
 * </ul>
 *
 * @param URL address of the page to fetch
 * @return the fetched page, or {@code null} when the fetch was abandoned
 */
private HtmlPage MyGetPage(StringBuffer URL) {
    final String threadName = Thread.currentThread().getName();
    // Upper bound for re-clicking the captcha submit button; once exhausted the failure is
    // handed to the general retry logic below instead of looping forever.
    final int maxClickAttempts = 5;
    HtmlPage page = null;
    boolean retry = true;
    int tryCount = 1;
    int unknownHostTryCount = 1;
    while (retry) {
        retry = false;
        try {
            logger.info(threadName + " webClient.getPage : " + URL + ",CrawlURL_id:" + crawlURLId);
            page = webClient.getPage(URL.toString());
            Document doc = Jsoup.parse(page.asXml());
            int robotCheckNum = 1;
            while (doc.select("title").text().equals("Robot Check")) {
                logger.info(threadName + " " + dayformat1.format(System.currentTimeMillis())
                        + " [Robot Check,URL:" + URL + "]");
                String captchaStr = AmazonGetCaptcha.GetCaptcha(new StringBuilder(doc.toString()));
                logger.info(threadName + " " + dayformat1.format(System.currentTimeMillis())
                        + " end AmazonGetCaptcha.GetCaptcha");
                logger.info(dayformat1.format(new Date()) + " " + threadName + " : " + captchaStr);

                logger.info(threadName + " page.getForms().get(0) Start");
                HtmlForm form = page.getForms().get(0);
                logger.info(threadName + " page.getForms().get(0) End");

                logger.info(threadName + " form.getElementsByTagName(button).get(0) Start");
                HtmlButton button = (HtmlButton) form.getElementsByTagName("button").get(0);
                logger.info(threadName + " form.getElementsByTagName(button).get(0) End");

                logger.info(threadName + " setValueAttribute Start");
                form.getInputByName("field-keywords").setValueAttribute(captchaStr);
                logger.info(threadName + " setValueAttribute End");

                logger.info(threadName + " button.click Start");
                boolean clicked = false;
                Exception lastClickError = null;
                for (int attempt = 1; attempt <= maxClickAttempts && !clicked; attempt++) {
                    try {
                        page = button.click();
                        clicked = true;
                    } catch (Exception e1) {
                        logger.error(threadName + " button.click出错了: " + e1);
                        lastClickError = e1;
                    }
                }
                if (!clicked) {
                    // Let the outer handlers count this as a failed attempt.
                    throw lastClickError;
                }
                logger.info(threadName + " button.click end");

                while (page.asXml() == null) {
                    logger.info(threadName + " page xml null");
                    logger.info(threadName + " " + page.asXml());
                    // Keep the refreshed page; discarding it would leave this loop unable to
                    // make progress.
                    page = (HtmlPage) page.refresh();
                    logger.info(threadName + " refresh End!");
                }
                logger.info(threadName + " button.click End");

                logger.info(threadName + " Start ParsePage!");
                doc = Jsoup.parse(page.asXml());
                if (!doc.select("title").text().equals("Robot Check")) {
                    logger.info(threadName + " " + doc.select("title").text());
                    logger.info(threadName + " " + dayformat1.format(System.currentTimeMillis())
                            + " [Robot Check,captcha success:" + captchaStr + ",try num:" + robotCheckNum + "]");
                }
                robotCheckNum++;
            }
        } catch (MalformedURLException e) {
            // A malformed URL stays malformed; retrying would never terminate.
            logger.error(threadName + " " + e);
            return null;
        } catch (UnknownHostException e) {
            logger.error(threadName + " " + e);
            retry = true;
            logger.info("found UnknownHostException,start sleep 20 min");
            try {
                // Long arithmetic so a large configured minute count cannot overflow.
                Thread.sleep(60000L * Integer.parseInt(Configuration.getProperties("unknowhost_sleeptime")));
            } catch (InterruptedException e1) {
                logger.error(threadName + " " + e1);
                Thread.currentThread().interrupt();
                return null;
            }
            logger.info("found UnknownHostException,end sleep 20 min");
            unknownHostTryCount++; // increment the failed-access counter
            logger.info(threadName + " " + dayformat1.format(System.currentTimeMillis())
                    + " [UnknowHostTryTimeCnt:" + unknownHostTryCount + "]");
            if (unknownHostTryCount > Integer.parseInt(Configuration.getProperties("unknowhost_maxtrytime"))) {
                return null;
            }
        } catch (Exception eq) {
            // Also covers FailingHttpStatusCodeException, so that case is bounded too.
            logger.error(threadName + " " + eq);
            tryCount++; // increment the failed-access counter
            logger.info(threadName + " " + dayformat1.format(System.currentTimeMillis())
                    + " [TryTimeCnt:" + tryCount + "]");
            if (tryCount > 5) {
                return null;
            }
            try {
                Thread.sleep(1000);
            } catch (InterruptedException e) {
                logger.error(threadName + e);
                Thread.currentThread().interrupt();
                return null;
            }
            retry = true;
        }
        try {
            // Random pause of 1500-1999 ms between requests.
            Thread.sleep(random.nextInt(500) + 1500);
        } catch (InterruptedException e) {
            logger.error(threadName + e);
            Thread.currentThread().interrupt();
            // Stop instead of re-fetching: hand back the page only if this attempt succeeded.
            return retry ? null : page;
        }
    }
    return page;
}

Java WebClient 总结的更多相关文章

  1. Spark案例分析

    一.需求:计算网页访问量前三名 import org.apache.spark.rdd.RDD import org.apache.spark.{SparkConf, SparkContext} /* ...

  2. C#调用JAVA接口WSSE方式用WebClient方式

    C#读取JAVA的WSSE接口的调用代码: 用webclient 方式: /// <summary> /// 调用java cxf ws_security加密的服务wcf客户端对应的加密类 ...

  3. websocket通信 实现java模拟一个client与webclient通信

    发文原由: 熟悉socket通信的同学,对于socket模拟server与client,实现相互通信, 或者使用websocket与java模拟的websocket服务器通信(比如一个聊天室),对于这 ...

  4. Java调用Http/Https接口(7,end)--WebClient调用Http/Https接口

    WebClient是Spring提供的非阻塞.响应式的Http客户端,提供同步及异步的API,将会代替RestTemplate及AsyncRestTemplate.文中所使用到的软件版本:Java 1 ...

  5. java.lang.UnsupportedClassVersionError: com/gargoylesoftware/htmlunit/WebClient : Unsupported major.minor version 52.0 (unable to load class com.gargoylesoftware.htmlunit.WebClient)

    java.lang.UnsupportedClassVersionError: com/gargoylesoftware/htmlunit/WebClient : Unsupported major. ...

  6. htmlunit学习之java.lang.NoSuchMethodError: com.gargoylesoftware.htmlunit.WebClient.getOptions()Lcom/gargoylesoftware/htmlunit/WebClientOptions;

    运行到这里就报错 java.lang.NoSuchMethodError: com.gargoylesoftware.htmlunit.WebClient.getOptions()Lcom/gargo ...

  7. webClient请求JAVA超时解决方案

    private class MyWebClient: WebClient { protected override WebRequest GetWebRequest(Uri uri) { WebReq ...

  8. C#、JAVA操作Hadoop(HDFS、Map/Reduce)真实过程概述。组件、源码下载。无法解决:Response status code does not indicate success: 500。

    一.Hadoop环境配置概述 三台虚拟机,操作系统为:Ubuntu 16.04. Hadoop版本:2.7.2 NameNode:192.168.72.132 DataNode:192.168.72. ...

  9. Atitit.http httpclient实践java c# .net php attilax总结

    Atitit.http httpclient实践java c# .net php attilax总结 1. Navtree>> net .http1 2. Httpclient理论1 2. ...

随机推荐

  1. Log4j、slf4j

    1.Log4j 1.1 Log4j简介 Log4j有三个主要的组件:Loggers(记录器),Appenders (输出位置)和Layouts(布局).这里可简单理解为日志类别,日志要输出的地方和日志 ...

  2. 其他系统与ecshop的会员整合

    步骤一:整合两个的会员数据 用软件Navicat 的 "导入向导"功能,导入你的原数据类型(sql,mdb,db)我的是mdb类型.下一步选择你原有的会员字段“user”.再进行下 ...

  3. Google 谷歌网页搜索, 学术搜索

    Google 谷歌网页搜索, 学术搜索 1. 网页搜索引擎-Google * https://letsgg.tk/ * https://google.kfd.me/ 谷歌搜索镜像:  http://d ...

  4. JabRef 文献管理软件

    JabRef 文献管理软件简明教程 大多只有使用LaTeX撰写科技论文的研究人员才能完全领略到JabRef的妙不可言,但随着对Word写作平台上BibTeX4Word插件的开发和便利应用,使用Word ...

  5. Robot Framework--安装篇

    一.安装包 1.Python 2.robotframework 3.selenium 4.selenium2library 5.WxPython 6.安装RIDE 二.安装过程 1.安装python ...

  6. EF--Codefirst 加密数据库连接字符串

    http://www.tuicool.com/articles/QvYbEn 一.EF,CodeFirst加密SQL连接符 public LifeHelpContext() : base(" ...

  7. Yii2的深入学习--自动加载机制(转)

    Yii2 的自动加载分两部分,一部分是 Composer 的自动加载机制,另一部分是 Yii2 框架自身的自动加载机制. Composer自动加载 对于库的自动加载信息,Composer 生成了一个  ...

  8. php-fpm进程关闭与重启脚本详解(转)

    先来理解一下什么是php-fpm PHP-FPM是一个PHP FastCGI管理器,是只用于PHP的. PHP-FPM其实是PHP源代码的一个补丁,旨在将FastCGI进程管理整合进PHP包中.必须将 ...

  9. C#中返回值封装

    在平时开发过程中常常需要取一个方法的返回值,BOSS写了一个返回值类,做个练习以备不时之需: 返回值支持泛型和非泛型 先贴上代码: 非泛型返回值类: using System; using Syste ...

  10. JAVA语言学习笔记(一)

    1 一切都是对象 JAVA中所有代码都必须写在类里面. 方法名和参数列表(它们合起来被称为"方法签名")唯一地标识出某个方法.联想多态. 基本数据类型的"局部变量" ...