/**
 * Builds a new HtmlUnit client that emulates Firefox 24.
 *
 * <p>The client is configured with a 20000 ms timeout, does not throw on failing HTTP status
 * codes or on script errors, and has CSS and JavaScript processing disabled. A fixed set of
 * browser-like request headers is attached to it.
 *
 * @return a freshly constructed and configured {@code WebClient}
 */
private WebClient getAWebClient() {
    WebClient webClient = new WebClient(BrowserVersion.FIREFOX_24);

    webClient.getOptions().setTimeout(20000);
    webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
    webClient.getOptions().setThrowExceptionOnScriptError(false);
    webClient.getOptions().setCssEnabled(false);
    webClient.getOptions().setJavaScriptEnabled(false);

    // The first media type must be "text/html"; the previous value "textml" was not a valid
    // media type, so the header did not actually advertise HTML.
    webClient.addRequestHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
    webClient.addRequestHeader("Accept-Encoding", "gzip, deflate");
    webClient.addRequestHeader("Accept-Language", "en-US,en;q=0.5");
    webClient.addRequestHeader("Cache-Control", "max-age=0");
    webClient.addRequestHeader("Connection", "keep-alive");
    // NOTE(review): Host is pinned for every request made through this client — confirm the
    // client is only ever used against www.amazon.com.
    webClient.addRequestHeader("Host", "www.amazon.com");
    webClient.addRequestHeader("User-Agent", "Mozilla/5.0 (X11; Linux x86_64; rv:24.0) Gecko/20100101 Firefox/24.0");
    return webClient;
}
/**
 * Crawls a single page and returns its XML serialization.
 *
 * <p>Before fetching, the shared client's cookies are cleared and, if the file named by
 * {@code cookiePathAppendRandom()} exists, replaced with the cookies deserialized from it. A
 * cookie file that cannot be opened or deserialized is logged and the fetch continues without
 * saved cookies.
 *
 * <p>If no page could be fetched, or the serialized page is 300 characters or shorter, the URL
 * is passed to {@code exceptionFun} and a builder holding the sentinel text
 * {@code "getNullPage"} is returned.
 *
 * @param url address of the page to crawl
 * @return the page XML, or a builder containing {@code "getNullPage"} on failure
 */
public StringBuilder crawlPage(String url) {
    final String threadName = Thread.currentThread().getName();
    StringBuilder builder = new StringBuilder();
    logger.info(threadName + " crawl " + url);

    // MyGetPage code goes here
    webClient.getCookieManager().clearCookies();
    logger.info(threadName + " webClient.getCookieManager().clearCookies();");
    File file = new File(cookiePathAppendRandom());
    logger.info(threadName + " File file = new File(cookiePathAppendRandom());");
    if (file.exists()) {
        restoreSavedCookies(file);
    }

    logger.info(threadName + " MyGetPage start,url:" + url);
    HtmlPage page = MyGetPage(new StringBuffer(url));
    logger.info(threadName + " MyGetPage end,url:" + url);
    if (page == null) {
        // Models that failed during crawling can be collected in one list and sent to the
        // server so they are put back on the crawl assignment queue.
        logger.info("Page null!");
        AmazonCrawlModel model = new AmazonCrawlModel(crawlId, crawlURLId, url, depth, ischange);
        exceptionFun(model);
        return new StringBuilder("getNullPage");
    }

    logger.info(threadName + " builder.append(page.asXml());");
    builder.append(page.asXml());
    logger.info(threadName + " return builder;");
    // builder.length() avoids copying the whole page into a String just to measure it.
    logger.info(threadName + " CrawlPage $Length=" + builder.length());
    if (builder.length() <= 300) {
        // Such a short document is treated the same way as a failed fetch.
        AmazonCrawlModel model = new AmazonCrawlModel(crawlId, crawlURLId, url, depth, ischange);
        exceptionFun(model);
        return new StringBuilder("getNullPage");
    }
    return builder;
}

/**
 * Deserializes a {@code CookieStore} from {@code file} and adds each of its cookies to the
 * shared client's cookie manager. Any failure to open or read the file is logged and leaves
 * the client without saved cookies.
 *
 * @param file serialized cookie store to read
 */
private void restoreSavedCookies(File file) {
    CookieStore cookieStore = null;
    FileInputStream fin = null;
    try {
        fin = new FileInputStream(file);
        ObjectInputStream in = new ObjectInputStream(fin);
        cookieStore = (CookieStore) in.readObject();
    } catch (IOException e) {
        // Also covers FileNotFoundException, e.g. the file vanished after the exists() check.
        logger.error(e);
    } catch (ClassNotFoundException e) {
        logger.error(e);
    } finally {
        // Closing the file stream releases the handle even when the ObjectInputStream
        // constructor or readObject() failed.
        if (fin != null) {
            try {
                fin.close();
            } catch (IOException e) {
                logger.error(e);
            }
        }
    }
    if (cookieStore == null) {
        // Nothing could be loaded; proceed with the already cleared cookie manager.
        return;
    }

    List<org.apache.http.cookie.Cookie> saved = cookieStore.getCookies();
    for (org.apache.http.cookie.Cookie temp : saved) {
        Cookie cookie = new Cookie(temp.getDomain(), temp.getName(), temp.getValue(), temp.getPath(),
                temp.getExpiryDate(), false);
        webClient.getCookieManager().addCookie(cookie);
    }
}
/**
 * Custom page fetch: loads {@code URL} with the shared client and, while the returned document
 * is titled "Robot Check", submits a recognized captcha until a different page comes back.
 *
 * <p>Failure handling:
 * <ul>
 *   <li>{@code MalformedURLException}: returns {@code null} immediately, because retrying the
 *       same string cannot succeed.</li>
 *   <li>{@code FailingHttpStatusCodeException}: retried after the pause at the end of the
 *       loop.</li>
 *   <li>{@code UnknownHostException}: sleeps for the number of minutes configured under
 *       {@code unknowhost_sleeptime}, then retries; returns {@code null} once the attempt
 *       counter exceeds {@code unknowhost_maxtrytime}.</li>
 *   <li>Any other exception: retried after one second; returns {@code null} on the fifth such
 *       failure.</li>
 *   <li>Interruption while sleeping: the interrupt flag is restored and the method returns
 *       right away, with the page if the fetch had already succeeded and {@code null}
 *       otherwise.</li>
 * </ul>
 *
 * @param URL address of the page to fetch
 * @return the fetched page, or {@code null} if it could not be fetched
 */
private HtmlPage MyGetPage(StringBuffer URL) {
    final String threadName = Thread.currentThread().getName();
    // Upper bound on consecutive failed captcha submissions before the failure is handed to
    // the outer retry handling; prevents a tight endless loop on a persistent error.
    final int maxClickAttempts = 5;

    HtmlPage page = null;
    boolean retry = true;
    int failedTries = 1;
    int unknownHostTries = 1;
    while (retry) {
        retry = false;
        try {
            logger.info(threadName + " webClient.getPage : " + URL + ",CrawlURL_id:"
                    + crawlURLId);
            page = webClient.getPage(URL.toString());
            Document doc = Jsoup.parse(page.asXml());
            int robotchecknum = 1;
            // NOTE(review): the number of captcha rounds is not limited; this loop only ends
            // when a non-"Robot Check" page is returned or an exception is thrown.
            while (doc.select("title").text().equals("Robot Check")) {
                logger.info(threadName + " " + dayformat1.format(System.currentTimeMillis())
                        + " [Robot Check,URL:" + URL + "]");
                String captcha_str = AmazonGetCaptcha.GetCaptcha(new StringBuilder(doc.toString()));
                logger.info(threadName + " " + dayformat1.format(System.currentTimeMillis())
                        + " end AmazonGetCaptcha.GetCaptcha");
                logger.info(dayformat1.format(new Date()) + " " + threadName + " : "
                        + captcha_str);

                logger.info(threadName + " page.getForms().get(0) Start");
                HtmlForm form = page.getForms().get(0);
                logger.info(threadName + " page.getForms().get(0) End");

                logger.info(threadName + " form.getElementsByTagName(button).get(0) Start");
                HtmlButton button = (HtmlButton) form.getElementsByTagName("button").get(0);
                logger.info(threadName + " form.getElementsByTagName(button).get(0) End");

                logger.info(threadName + " setValueAttribute Start");
                form.getInputByName("field-keywords").setValueAttribute(captcha_str);
                logger.info(threadName + " setValueAttribute End");

                logger.info(threadName + " button.click Start");
                boolean clicked = false;
                Exception lastClickError = null;
                for (int attempt = 1; attempt <= maxClickAttempts && !clicked; attempt++) {
                    try {
                        page = button.click();
                        clicked = true;
                    } catch (Exception e1) {
                        lastClickError = e1;
                        logger.error(threadName + " button.click出错了: " + e1);
                    }
                }
                if (!clicked) {
                    // Let the catch clauses below apply their own retry limits.
                    throw lastClickError;
                }
                logger.info(threadName + " button.click end");

                while (page.asXml() == null) {
                    logger.info(threadName + " page xml null");
                    logger.info(threadName + " " + page.asXml());
                    // refresh() returns the reloaded page; it has to be assigned, otherwise
                    // the loop condition keeps testing the same object.
                    page = (HtmlPage) page.refresh();
                    logger.info(threadName + " refresh End!");
                }
                logger.info(threadName + " button.click End");

                logger.info(threadName + " Start ParsePage!");
                doc = Jsoup.parse(page.asXml());
                if (!doc.select("title").text().equals("Robot Check")) {
                    logger.info(threadName + " " + doc.select("title").text());
                    logger.info(threadName + " "
                            + dayformat1.format(System.currentTimeMillis()) + " [Robot Check,captcha success:"
                            + captcha_str + ",try num:" + robotchecknum + "]");
                }
                robotchecknum++;
            }
        } catch (FailingHttpStatusCodeException e) {
            logger.error(threadName + " " + e);
            retry = true;
        } catch (MalformedURLException e) {
            logger.error(threadName + " " + e);
            // The URL text does not change between attempts, so a retry would fail forever.
            return null;
        } catch (UnknownHostException e) {
            logger.error(threadName + " " + e);
            retry = true;
            logger.info("found UnknownHostException,start sleep 20 min");
            try {
                // long arithmetic so a large configured value cannot overflow int
                Thread.sleep(60000L * Integer.parseInt(Configuration.getProperties("unknowhost_sleeptime")));
            } catch (InterruptedException e1) {
                logger.error(threadName + " " + e1);
                Thread.currentThread().interrupt();
                return null;
            }
            logger.info("found UnknownHostException,end sleep 20 min");
            unknownHostTries++; // one more failed access
            logger.info(threadName + " " + dayformat1.format(System.currentTimeMillis())
                    + " [UnknowHostTryTimeCnt:" + unknownHostTries + "]");
            if (unknownHostTries > Integer.parseInt(Configuration.getProperties("unknowhost_maxtrytime"))) {
                return null;
            }
        } catch (Exception eq) {
            logger.error(threadName + " " + eq);
            failedTries++; // one more failed access
            logger.info(threadName + " " + dayformat1.format(System.currentTimeMillis())
                    + " [TryTimeCnt:" + failedTries + "]");
            if (failedTries > 5) {
                return null;
            }
            try {
                Thread.sleep(1000);
            } catch (InterruptedException e) {
                logger.error(threadName + e);
                Thread.currentThread().interrupt();
                return null;
            }
            retry = true;
        }

        // Randomized 1.5–2 s pause after every attempt.
        try {
            Thread.sleep(random.nextInt(500) + 1500);
        } catch (InterruptedException e) {
            logger.error(threadName + e);
            // Keep the interrupt visible to the caller and stop instead of fetching again.
            Thread.currentThread().interrupt();
            return retry ? null : page;
        }
    }
    return page;
}

Java WebClient 总结的更多相关文章

  1. Spark案例分析

    一.需求:计算网页访问量前三名 import org.apache.spark.rdd.RDD import org.apache.spark.{SparkConf, SparkContext} /* ...

  2. C#调用JAVA接口WSSE方式用WebClient方式

    C#读取JAVA的WSSE接口的调用代码: 用webclient 方式: /// <summary> /// 调用java cxf ws_security加密的服务wcf客户端对应的加密类 ...

  3. websocket通信 实现java模拟一个client与webclient通信

    发文原由: 熟悉socket通信的同学,对于socket模拟server与client,实现相互通信, 或者使用websocket与java模拟的websocket服务器通信(比如一个聊天室),对于这 ...

  4. Java调用Http/Https接口(7,end)--WebClient调用Http/Https接口

    WebClient是Spring提供的非阻塞.响应式的Http客户端,提供同步及异步的API,将会代替RestTemplate及AsyncRestTemplate.文中所使用到的软件版本:Java 1 ...

  5. java.lang.UnsupportedClassVersionError: com/gargoylesoftware/htmlunit/WebClient : Unsupported major.minor version 52.0 (unable to load class com.gargoylesoftware.htmlunit.WebClient)

    java.lang.UnsupportedClassVersionError: com/gargoylesoftware/htmlunit/WebClient : Unsupported major. ...

  6. htmlunit学习之java.lang.NoSuchMethodError: com.gargoylesoftware.htmlunit.WebClient.getOptions()Lcom/gargoylesoftware/htmlunit/WebClientOptions;

    运行到这里就报错 java.lang.NoSuchMethodError: com.gargoylesoftware.htmlunit.WebClient.getOptions()Lcom/gargo ...

  7. webClient请求JAVA超时解决方案

    private class MyWebClient: WebClient { protected override WebRequest GetWebRequest(Uri uri) { WebReq ...

  8. C#、JAVA操作Hadoop(HDFS、Map/Reduce)真实过程概述。组件、源码下载。无法解决:Response status code does not indicate success: 500。

    一.Hadoop环境配置概述 三台虚拟机,操作系统为:Ubuntu 16.04. Hadoop版本:2.7.2 NameNode:192.168.72.132 DataNode:192.168.72. ...

  9. Atitit.http httpclient实践java c# .net php attilax总结

    Atitit.http httpclient实践java c# .net php attilax总结 1. Navtree>> net .http1 2. Httpclient理论1 2. ...

随机推荐

  1. php瀑布流,把一个数组分4个数组,按照时间排序

    简单介绍:把一个数组分成4个数组,取其中1的倍数 <?php $arr = array( ', ', ', ', ', ', ', ', ', ', ', ', ', ); foreach($a ...

  2. 极光推送 JPush 项目简单使用

    打开或者关闭推送 - (void)pushSwitch:(UISwitch *)sender { if (sender.on) { [[NSUserDefaults standardUserDefau ...

  3. JAVA中的聚集和组合的区别和联系

    选自《JAVA语言程序设计-基础篇(原书第8版)》 定义:一个对象可以包含另一个对象。这两个对象之间的关系称为组合(composition)。组合实际上是聚集关系的一种特殊形式。聚集模拟 ...

  4. imageserver

    https://bitbucket.org/tamtam-nl/tamtam-nuget-imageserver/overview https://www.nuget.org/packages/Tam ...

  5. Winform端上传图片到服务器

    转载自  在winform实现文件上传到服务器 webform上传文件可能大家都写过很多,一个HtmlInputFile.PostedFile.SaveAs就搞定了,不过不知道大家有没有在winfor ...

  6. knockout-validation不自动插入错误消息

    <div data-bind="validationOptions:{insertMessages:false}"> <div class="valid ...

  7. Spring4学习笔记-AOP

    1.加入jar包 com.springsource.org.aopalliance-1.0.0.jar com.springsource.org.aspectj.weaver-1.6.8.RELEAS ...

  8. 解决安装VS2013提示“已停止工作”问题

    新安装操作系统(win8.1),手动安装各种驱动,安装VS2013,报错,见下图: 原因:显卡驱动问题. 解决办法:卸载intel显卡驱动这碧池.(系统会自动给你适配合适的)

  9. java获取指定路径下的指定文件/java.io.File.listFiles(FilenameFilter filter)

    java.io.File.listFiles(FilenameFilter filter) 返回抽象路径名数组,表示在目录中此抽象路径名表示,满足指定过滤器的文件和目录. 声明 以下是java.io. ...

  10. 将Ubuntu 15.10升级到Ubuntu 16.04

    Ubuntu 16.04 LTS 代号为 Xenial Xerus,其最终版将于 2016 年 4 月 21 日正式发布,Ubuntu16.04 将是非常受欢迎的开源操作系统 Ubuntu 的第 6 ...