a code snip
import java.util.ArrayList;
import java.util.HashMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern; import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.extern.slf4j.Slf4j; import org.apache.commons.lang3.StringEscapeUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements; import com.creditcloud.brick.task.CrawlTask;
import com.creditcloud.brick.task.Extractor;
import com.creditcloud.brick.task.Field; @NoArgsConstructor
@AllArgsConstructor
@Data
@Slf4j
public class DataCrawler {
String url;
String space; public HashMap<String, Object> doCrawl(CrawlTask task ) {
HashMap<String, Object> result = new HashMap<String, Object>();
this.url = task.getUrl();
this.space = task.getId();
//
String content=this.doGet(url);
if( StringUtils.isNotEmpty(content)) {
Extractor actor=task.getExtractor();
if( actor != null )
this.parse(actor, content, result);
}
return result;
} public String doGet( String url ) {
String data=null;
//return Jsoup.connect(url).userAgent("Mozilla").get();
CloseableHttpClient httpclient = HttpClients.createDefault();
HttpGet hGet = new HttpGet(url);
log.info(url);
CloseableHttpResponse response = null;
try {
response = httpclient.execute(hGet);
HttpEntity entity = response.getEntity();
System.out.println(response.getStatusLine());
log.info( response.getStatusLine().toString() );
//
if (entity != null) {
System.out.println("Response content length: " + entity.getContentLength());
data = EntityUtils.toString(entity);
System.out.println(data);
EntityUtils.consume(entity); }
}
catch(Exception e){
log.error(e.getMessage());
}
//response
try
{
if( response != null )
response.close();
}
catch(Exception e) {
log.error(e.getMessage());
}
//httpclient
try {
if( httpclient != null )
httpclient.close();
} catch (Exception e) {
log.error(e.getMessage());
} return data;
} public String removeHtmlLabel( String input ) {
return input.replaceAll("<[^>]+>", "").replaceAll(" "," ").trim();
} //
public ArrayList<String> match( Extractor extractor, String input ) {
ArrayList<String> result = new ArrayList<String>();
switch (extractor.getType()) {
case css: //call css
{
Document doc = Jsoup.parse(input);
Elements elems = doc.select(extractor.getPattern());
for( Element elem:elems ) {
result.add( elem.toString() );
}
}
break;
case regex: //call regex
{
Pattern p = Pattern.compile( extractor.getPattern());
Matcher m = p.matcher( input );
String matchValue = null;
while(m.find()) {
matchValue = StringEscapeUtils.unescapeHtml4( m.group());
result.add(matchValue);
}
}
break;
case empty:
result.add(input);
break;
}
return result;
} public void parse( Extractor extractor, String input, HashMap<String, Object> result ) {
//1. match by css or regex
ArrayList<String> strlist = this.match(extractor, input);
if( strlist.isEmpty() ) {
//result.put( extractor.getId(), null);
return;
}
//2. call children extractors
switch(extractor.getData()) {
case array:{
//result.setType(ResultDataType.array);
ArrayList<HashMap<String, Object>> list = new ArrayList<HashMap<String, Object>>();
for( String str:strlist ) {
HashMap<String, Object> childResult = new HashMap<String, Object>();
for( Extractor one:extractor.getChildren()) {
this.parse(one, str, childResult);
}
if( childResult.isEmpty() == false )
list.add(childResult);
}
if(list.isEmpty() == false )
result.put( extractor.getId(), list );
}
break;
case field:{
for(Field fd:extractor.getFields()) {
String val=strlist.get( fd.getIndex() );
result.put( fd.getName(), this.removeHtmlLabel(val) );
}
}
break;
case none: {
for( String str:strlist ) {
for( Extractor one:extractor.getChildren()) {
this.parse(one, str, result);
}
}
}
break;
}
}
}
a code snip的更多相关文章
- CSS code snip enjoy.
<!-- information-total得是动态获取吧. --> <div class="information-mod"> <div class ...
- C# Code Snip
1.Tryf + TAB+TAB try { } finally { } 2.Prop+Tab+Tab public int MyProperty { get; set; } 3. #region + ...
- WPF整理-跨程序集访问资源
“Sometimes binary resources are defined in one assembly (typically a class library), but are needed i ...
- WPF整理-使用用户选择主题的颜色和字体
“Sometimes it's useful to use one of the selected colors or fonts the user has chosen in the Windows ...
- WPF整理-XAML访问静态属性
"XAML provides an easy way to set values of properties—type converters and the extended propert ...
- WPF整理-XAML构建后台类对象
1.XAML 接触WPF的第一眼就是XAML---XAML是用来描绘界面的.其实不然! "Actually, XAML has nothing to do with UI. It's mer ...
- Call C# in powershell
How to call C# code in powershell Powershell Command Add-Type usage of Add-Type we use Add-Type -Typ ...
- Windows Phone 8 开发必备资源
一.MVVM框架推荐 1. MVVM-Light 这个框架是我最常用的MVVM框架之一,它比Prism更轻量级,但对于一般的小应用,功能足够. 官方网站:http://mvvmlight.codepl ...
- 字符串的驻留(String Interning)
http://www.cnblogs.com/artech/archive/2007/03/04/663728.html 关于字符串的驻留的机制,对于那些了解它的人肯定会认为很简单,但是我相信会有很大 ...
随机推荐
- hadoop2.5发布:最新编译 32位、64位安装、源码包、API以及新特性
hadoop2.5发布:最新编译 32位.64位安装.源码包.API以及新特性 http://www.aboutyun.com/thread-8751-1-1.html (出处: about云开发) ...
- Emacs和它的朋友们——阅读源代码篇(转)
正如那本<Code Reading>一书中指出的那样,源代码阅读一直没有被很好的重 视:你上大学的时候有“代码阅读”这门课吗?相信没有. 1 Source Insight 谈到阅读源代码, ...
- 你真的了解一段Java程序的生命史吗
作为一名程序猿 ,我们每天都在写Code,但你真的了解它的生命周期么?今天就来简单聊下它的生命历程,说起一段Java Code,从出生到game over大体分这么几步:编译.类加载.运行.GC. 编 ...
- gulp之静态资源防缓存处理
最近,因为校友网项目开始有些规模了.开始就要考虑对静态资源进行工程自动化的管理.一讲到前端的自动化工具,大家或许都会想到Grunt,Gulp,或者百度的FIS.这三个都有各自的特点,大家可以依据自己的 ...
- Hibernate之HQL查询
一.Hibernate 提供了以下几种检索对象的方式: 导航对象图检索方式: 根据已经加载的对象导航到其他对象 OID 检索方式: 按照对象的 OID 来检索对象 HQL 检索方式:使用面向对象的 H ...
- Activex WindowsMediaPlayer控件主要方法属性
属性/方法名: 说明:[基本属性] URL:String; 指定媒体位置,本机或网络地址 uiMode:String; 播放器界面模式,可为Full, Mini, None, Invisible pl ...
- 用C#调用蓝牙编程
2013-04-22 09:41:06 什么是蓝牙? 现在智能手机这么发达,蓝牙对我们来说肯定不陌生.我来介绍一下官方概念: 蓝牙,是一种支持设备短距离通信(一般10m内)的无线电技术.能在包括移动电 ...
- kotlin使用手记
几个月前接触过scala,当时下载一个库用maven引入,弄了很久,后来觉得没起一个项目有点重量级,一次偶然机会在升级idea的时候,发现jetbrains官网出了一门新的jvm编程语言kotlin, ...
- CSS基础(02)
CSS 选择器 1.CSS3 选择器简介 在 CSS 中,选择器是一种模式,用于选择需要添加样式的元素. 语法: 下面中"CSS" 列指示该属性是在哪个 CSS 版本中定义的.(C ...
- 用shader使图片背景透明
转自:http://blog.csdn.net/dawn_moon/article/details/8631783 好吧,终于抽时间写这篇文章了. 手头上有很多人物行走图,技能特效图等,但这些图都有个 ...