java访问网络资源,由底层到封装依次为:socket ==> java.net.HttpURLConnection ==> HttpClient

这次先阐述 java.net.HttpURLConnection 的方式,好处是不用额外导入第三方包,JDK 原生自带。

HtmlUtil 包含尝试重连(3次) ,编码识别,保存文件到磁盘

package com.cph.crawler.core.utils;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLEncoder;
import java.util.regex.Matcher;
import java.util.regex.Pattern; import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory; /**
* 类说明:html有关的操作 <br>
* --22下午08::20创建<br>
*
* @author cphmvp
*/
public final class HtmlUtil {
public final static Log LOG = LogFactory.getLog(HtmlUtil.class);
static String defaultEncoding = "utf-8";
static HttpURLConnection httpURLConnection = null;
static URL urlModel = null;
// 链接超时时间
static int connectTimeout = ;
// 读取响应超时时间
static int readTimeout = ; /**
* 下载图片<br>
*
* @param url
* 图片的下载地址<br>
* @param savePath
* 保存路径<br>
* @throws IOException
*/
@SuppressWarnings("resource")
public static void downloadAndSavePictureToDisk(String url, String savePath)
throws IOException {
urlModel = new URL(url);
httpURLConnection = (HttpURLConnection) urlModel.openConnection();
httpURLConnection.setConnectTimeout(connectTimeout);
httpURLConnection.setReadTimeout(readTimeout);
httpURLConnection.setDoOutput(true);
InputStream is = httpURLConnection.getInputStream();
BufferedReader rd = new BufferedReader(new InputStreamReader(is));
FileOutputStream fw = null;
File f = new File(savePath.substring(, savePath.lastIndexOf("/"))); if (!f.exists()) {
f.mkdirs();
}
File eixtsFile = new File(savePath);
if (eixtsFile.exists()) {
return;
}
fw = new FileOutputStream(savePath, true);
int num = -;
while ((num = is.read()) != (-))// 是否读完所有数据
{
fw.write(num);// 将数据写往文件
}
rd.close();
is.close();
if (httpURLConnection != null) {
httpURLConnection.disconnect();
} } /**
* 讲url后面的参数进行编码
*
* @param url
* @return
* @throws UnsupportedEncodingException
*/
private static String encodParamters(String url)
throws UnsupportedEncodingException {
String returnStr = new String(url);
String regex = "=([^&]+)";
Pattern p = Pattern.compile(regex);
Matcher m = p.matcher(url);
while (m.find()) {
String replaceStr = m.group();
returnStr = returnStr.replaceFirst(replaceStr,
URLEncoder.encode(replaceStr, "utf-8"));
}
return returnStr;
} /**
* 获取会话的JSESSIONID
*
* @param url
* @return
*/
public static String getSession(String url) {
String sessionId = "";
try {
urlModel = new URL(url);
httpURLConnection = (HttpURLConnection) urlModel.openConnection();
httpURLConnection.setConnectTimeout(connectTimeout);
httpURLConnection.setReadTimeout(readTimeout);
String cookieVal = null;
String key = null;
for (int i = ; (key = httpURLConnection.getHeaderFieldKey(i)) != null; i++) {
if (key.equalsIgnoreCase("set-cookie")) {
cookieVal = httpURLConnection.getHeaderField(i);
cookieVal = cookieVal.substring(, cookieVal.indexOf(";"));
sessionId = sessionId + cookieVal + ";";
}
} } catch (MalformedURLException e) {
LOG.error(e);
} catch (IOException e) {
LOG.error(e);
}
return sessionId;
} /**
* 下载页面</br>
*
* @param page
* </br>
* @return 页面源码
* @throws IOException
* @throws UnsupportedEncodingException
*/
public static StringBuffer downloadHtml(String url,String encoding) {
StringBuffer sb = new StringBuffer();
BufferedReader in = null;
int tryNum = ;
while (true) {
try {
if (tryNum > ) {
String ecodingUrl = encodParamters(url);
urlModel = new URL(ecodingUrl);
} else {
urlModel = new URL(url);
}
httpURLConnection = (HttpURLConnection) urlModel
.openConnection();
httpURLConnection.setConnectTimeout(connectTimeout);
httpURLConnection.setReadTimeout(readTimeout);
httpURLConnection
.setRequestProperty("User-Agent",
"Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)");
String redirectUrl = httpURLConnection.getURL().toString();
if (!redirectUrl.equals(url)) {
LOG.info(url + "重定向后为" + redirectUrl);
}
String charSetHeader = httpURLConnection
.getHeaderField("Content-Type");
String charSet = null;
if (charSetHeader != null) {
Pattern p = Pattern.compile("charset=[\"']?(.*?)['\"]");
Matcher m = p.matcher(charSetHeader);
if (m.find())
charSet = m.group().trim();
if (null == charSet) {
charSet = encoding;
}
} charSet = (charSet == null ? encoding : charSet);
in = new BufferedReader(new InputStreamReader(
httpURLConnection.getInputStream(), charSet));
String inputLine;
while ((inputLine = in.readLine()) != null) {
sb.append(inputLine + "\n");
inputLine = null;
}
if (in != null)
try {
in.close();
} catch (IOException e) {
LOG.error(e);
}
if (httpURLConnection != null)
httpURLConnection.disconnect();
break;
} catch (Exception e) {
if (tryNum++ == ) {
LOG.error("download page error [ " + urlModel + " ] ");
return null;
}
LOG.warn(tryNum + "次下载失败", e);
}
}
return sb; }
/**
* 下载页面</br>
*
* @param page
* </br>
* @return 页面源码
* @throws IOException
* @throws UnsupportedEncodingException
*/
public static StringBuffer downloadHtml(String url) {
StringBuffer sb = new StringBuffer();
BufferedReader in = null;
int tryNum = ;
while (true) {
try {
if (tryNum > ) {
String ecodingUrl = encodParamters(url);
urlModel = new URL(ecodingUrl);
} else {
urlModel = new URL(url);
}
httpURLConnection = (HttpURLConnection) urlModel
.openConnection();
httpURLConnection.setConnectTimeout(connectTimeout);
httpURLConnection.setReadTimeout(readTimeout);
httpURLConnection
.setRequestProperty("User-Agent",
"Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)");
String redirectUrl = httpURLConnection.getURL().toString();
if (!redirectUrl.equals(url)) {
LOG.info(url + "重定向后为" + redirectUrl);
}
String charSetHeader = httpURLConnection
.getHeaderField("Content-Type");
String charSet = null;
if (charSetHeader != null) {
Pattern p = Pattern.compile("charset=[\"']?(.*?)['\"]");
Matcher m = p.matcher(charSetHeader);
if (m.find())
charSet = m.group().trim();
if (null == charSet) {
charSet = defaultEncoding;
}
} charSet = (charSet == null ? defaultEncoding : charSet);
in = new BufferedReader(new InputStreamReader(
httpURLConnection.getInputStream(), charSet));
String inputLine;
while ((inputLine = in.readLine()) != null) {
sb.append(inputLine + "\n");
inputLine = null;
}
if (in != null)
try {
in.close();
} catch (IOException e) {
LOG.error(e);
}
if (httpURLConnection != null)
httpURLConnection.disconnect();
break;
} catch (Exception e) {
if (tryNum++ == ) {
LOG.error("download page error [ " + urlModel + " ] ");
return null;
}
LOG.warn(tryNum + "次下载失败", e);
}
}
return sb; } }

                             

crawler_基础之_java.net.HttpURLConnection 访问网络资源的更多相关文章

  1. 使用VC建立网络连接并访问网络资源

    目录 1. 提出问题 2. 解决方案 1. 提出问题 在windows下可以通过系统操作,将局域网的资源映射到本地,从而实现像本地数据一样访问网络资源.实际上这些步骤也可通过代码调用win32函数实现 ...

  2. java成神之——HttpURLConnection访问api

    HttpURLConnection 访问get资源 访问post资源 访问Delete资源 获取状态码 结语 HttpURLConnection 访问get资源 HttpURLConnection c ...

  3. 【CUDA 基础】4.3 内存访问模式

    title: [CUDA 基础]4.3 内存访问模式 categories: - CUDA - Freshman tags: - 内存访问模式 - 对齐 - 合并 - 缓存 - 结构体数组 - 数组结 ...

  4. crawler_基础之_httpclient 访问网络资源

    先粘贴一个 简单版的,后期再修改 pom文件 <dependency> <groupId>org.apache.httpcomponents</groupId> & ...

  5. 简单使用URLConnection、HttpURLConnection和HttpClient访问网络资源

    URL的openConnection方法将返回一个URLConnection,该对象表示应用程序和URL之间的通信连接.程序可以通过它的实例向该URL发送请求,读取URL引用的资源. 下面通过一个简单 ...

  6. Java多线程基础——对象及变量并发访问

    在开发多线程程序时,如果每个多线程处理的事情都不一样,每个线程都互不相关,这样开发的过程就非常轻松.但是很多时候,多线程程序是需要同时访问同一个对象,或者变量的.这样,一个对象同时被多个线程访问,会出 ...

  7. 关于安卓开发当中通过java自带的HttpURLConnection访问XML的java.io.EOFException问题

    刚接触安卓开发,试着写个小程序熟悉下,就写了天气预报的小程序,通过httpUrlConnection读流的方式来获取网络公共接口提供的天气XML信息.但在建立http连接时一直报java.io.EOF ...

  8. 通过HTTP访问网络资源

    添加访问网络的权限:<uses-permission android:name="android.permission.INTERNET"/> package com. ...

  9. Android网络:HTTP之利用HttpURLConnection访问网页、获取网络图片实例 (附源码)

    http://blog.csdn.net/yanzi1225627/article/details/22222735 如前文所示的TCP局域网传送东西,除了对传输层的TCP/UDP支持良好外,Andr ...

随机推荐

  1. poj 1384 Piggy-Bank(全然背包)

    http://poj.org/problem?id=1384 Piggy-Bank Time Limit: 1000MS Memory Limit: 10000K Total Submissions: ...

  2. 【C++探索之旅】第一部分第三课:第一个C++程序

    内容简介 1.第一部分第三课:第一个C++程序 2.第一部分第四课预告:内存的使用 第一个C++程序 经过上两课之后,我们已经知道了什么是编程,编程的语言,编程的必要软件,C++是什么,我们也安装了适 ...

  3. STL源代码剖析(一) - 内存分配

    Allocaor allocator 指的是空间配置器,用于分配内存.STL中默认使用SGI STL alloc作为STL的内存分配器,尽管未能符合标准规格,但效率上更好.SGI STL也定义有一个符 ...

  4. jQuery 代码的层定位滑动动画效果

    <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/ ...

  5. hive内置函数大全

    ====================================== 一.关系函数 1.等值比較:=     语法:A=B 操作类型:全部基本类型 2.不等值比較:<>     语 ...

  6. W3C DOM 事件模型(简述)

    1.事件模型 由于事件捕获与冒泡模型都有其长处和解释,DOM标准支持捕获型与冒泡型,能够说是它们两者的结合体.它能够在一个DOM元素上绑定多个事件处理器,而且在处理函数内部,thiskeyword仍然 ...

  7. curl转让query string逃生参数

    假设curl访问http网站.传递参数.需要使用\如&字首. 例: http://myjenkins/job/run_schedule/buildWithParameters?token=fe ...

  8. [HA]负载均衡:HAPROXY与KEEPALIVED强强联合

    第一步:更改系统控制配置文件,同意分配虚拟IP(VIP) /etc/sysctl.conf net.ipv4.ip_nonlocal_bind=1 <pre style="word-w ...

  9. Repository、IUnitOfWork和IDbContext

    DDD 领域驱动设计-谈谈Repository.IUnitOfWork和IDbContext的实践 上一篇:<DDD 领域驱动设计-谈谈 Repository.IUnitOfWork 和 IDb ...

  10. Windows store 验证你的 URL http:// 和 https:// ms-appx:/// ms-appdata:///local

    前缀 使用 注意事项 http:// 和 https:// 联机存储的图像 这些图像可能缓存在本地,因此图像服务器可能未收到图像的请求.可以在这些 URL 中附加查询字符串.确保 Web 服务器返回原 ...