Java 访问网络资源,由底层到封装依次为:socket ==> java.net.HttpURLConnection ==> HttpClient

这次先阐述 java.net.HttpURLConnection 的方式,好处是不用导包,JDK 原生自带。

HtmlUtil 包含尝试重连(3次) ,编码识别,保存文件到磁盘

package com.cph.crawler.core.utils;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLEncoder;
import java.util.regex.Matcher;
import java.util.regex.Pattern; import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory; /**
* 类说明:html有关的操作 <br>
* --22下午08::20创建<br>
*
* @author cphmvp
*/
public final class HtmlUtil {
public final static Log LOG = LogFactory.getLog(HtmlUtil.class);
static String defaultEncoding = "utf-8";
static HttpURLConnection httpURLConnection = null;
static URL urlModel = null;
// 链接超时时间
static int connectTimeout = ;
// 读取响应超时时间
static int readTimeout = ; /**
* 下载图片<br>
*
* @param url
* 图片的下载地址<br>
* @param savePath
* 保存路径<br>
* @throws IOException
*/
@SuppressWarnings("resource")
public static void downloadAndSavePictureToDisk(String url, String savePath)
throws IOException {
urlModel = new URL(url);
httpURLConnection = (HttpURLConnection) urlModel.openConnection();
httpURLConnection.setConnectTimeout(connectTimeout);
httpURLConnection.setReadTimeout(readTimeout);
httpURLConnection.setDoOutput(true);
InputStream is = httpURLConnection.getInputStream();
BufferedReader rd = new BufferedReader(new InputStreamReader(is));
FileOutputStream fw = null;
File f = new File(savePath.substring(, savePath.lastIndexOf("/"))); if (!f.exists()) {
f.mkdirs();
}
File eixtsFile = new File(savePath);
if (eixtsFile.exists()) {
return;
}
fw = new FileOutputStream(savePath, true);
int num = -;
while ((num = is.read()) != (-))// 是否读完所有数据
{
fw.write(num);// 将数据写往文件
}
rd.close();
is.close();
if (httpURLConnection != null) {
httpURLConnection.disconnect();
} } /**
* 讲url后面的参数进行编码
*
* @param url
* @return
* @throws UnsupportedEncodingException
*/
private static String encodParamters(String url)
throws UnsupportedEncodingException {
String returnStr = new String(url);
String regex = "=([^&]+)";
Pattern p = Pattern.compile(regex);
Matcher m = p.matcher(url);
while (m.find()) {
String replaceStr = m.group();
returnStr = returnStr.replaceFirst(replaceStr,
URLEncoder.encode(replaceStr, "utf-8"));
}
return returnStr;
} /**
* 获取会话的JSESSIONID
*
* @param url
* @return
*/
public static String getSession(String url) {
String sessionId = "";
try {
urlModel = new URL(url);
httpURLConnection = (HttpURLConnection) urlModel.openConnection();
httpURLConnection.setConnectTimeout(connectTimeout);
httpURLConnection.setReadTimeout(readTimeout);
String cookieVal = null;
String key = null;
for (int i = ; (key = httpURLConnection.getHeaderFieldKey(i)) != null; i++) {
if (key.equalsIgnoreCase("set-cookie")) {
cookieVal = httpURLConnection.getHeaderField(i);
cookieVal = cookieVal.substring(, cookieVal.indexOf(";"));
sessionId = sessionId + cookieVal + ";";
}
} } catch (MalformedURLException e) {
LOG.error(e);
} catch (IOException e) {
LOG.error(e);
}
return sessionId;
} /**
* 下载页面</br>
*
* @param page
* </br>
* @return 页面源码
* @throws IOException
* @throws UnsupportedEncodingException
*/
public static StringBuffer downloadHtml(String url,String encoding) {
StringBuffer sb = new StringBuffer();
BufferedReader in = null;
int tryNum = ;
while (true) {
try {
if (tryNum > ) {
String ecodingUrl = encodParamters(url);
urlModel = new URL(ecodingUrl);
} else {
urlModel = new URL(url);
}
httpURLConnection = (HttpURLConnection) urlModel
.openConnection();
httpURLConnection.setConnectTimeout(connectTimeout);
httpURLConnection.setReadTimeout(readTimeout);
httpURLConnection
.setRequestProperty("User-Agent",
"Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)");
String redirectUrl = httpURLConnection.getURL().toString();
if (!redirectUrl.equals(url)) {
LOG.info(url + "重定向后为" + redirectUrl);
}
String charSetHeader = httpURLConnection
.getHeaderField("Content-Type");
String charSet = null;
if (charSetHeader != null) {
Pattern p = Pattern.compile("charset=[\"']?(.*?)['\"]");
Matcher m = p.matcher(charSetHeader);
if (m.find())
charSet = m.group().trim();
if (null == charSet) {
charSet = encoding;
}
} charSet = (charSet == null ? encoding : charSet);
in = new BufferedReader(new InputStreamReader(
httpURLConnection.getInputStream(), charSet));
String inputLine;
while ((inputLine = in.readLine()) != null) {
sb.append(inputLine + "\n");
inputLine = null;
}
if (in != null)
try {
in.close();
} catch (IOException e) {
LOG.error(e);
}
if (httpURLConnection != null)
httpURLConnection.disconnect();
break;
} catch (Exception e) {
if (tryNum++ == ) {
LOG.error("download page error [ " + urlModel + " ] ");
return null;
}
LOG.warn(tryNum + "次下载失败", e);
}
}
return sb; }
/**
* 下载页面</br>
*
* @param page
* </br>
* @return 页面源码
* @throws IOException
* @throws UnsupportedEncodingException
*/
public static StringBuffer downloadHtml(String url) {
StringBuffer sb = new StringBuffer();
BufferedReader in = null;
int tryNum = ;
while (true) {
try {
if (tryNum > ) {
String ecodingUrl = encodParamters(url);
urlModel = new URL(ecodingUrl);
} else {
urlModel = new URL(url);
}
httpURLConnection = (HttpURLConnection) urlModel
.openConnection();
httpURLConnection.setConnectTimeout(connectTimeout);
httpURLConnection.setReadTimeout(readTimeout);
httpURLConnection
.setRequestProperty("User-Agent",
"Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)");
String redirectUrl = httpURLConnection.getURL().toString();
if (!redirectUrl.equals(url)) {
LOG.info(url + "重定向后为" + redirectUrl);
}
String charSetHeader = httpURLConnection
.getHeaderField("Content-Type");
String charSet = null;
if (charSetHeader != null) {
Pattern p = Pattern.compile("charset=[\"']?(.*?)['\"]");
Matcher m = p.matcher(charSetHeader);
if (m.find())
charSet = m.group().trim();
if (null == charSet) {
charSet = defaultEncoding;
}
} charSet = (charSet == null ? defaultEncoding : charSet);
in = new BufferedReader(new InputStreamReader(
httpURLConnection.getInputStream(), charSet));
String inputLine;
while ((inputLine = in.readLine()) != null) {
sb.append(inputLine + "\n");
inputLine = null;
}
if (in != null)
try {
in.close();
} catch (IOException e) {
LOG.error(e);
}
if (httpURLConnection != null)
httpURLConnection.disconnect();
break;
} catch (Exception e) {
if (tryNum++ == ) {
LOG.error("download page error [ " + urlModel + " ] ");
return null;
}
LOG.warn(tryNum + "次下载失败", e);
}
}
return sb; } }

                             

crawler_基础之_java.net.HttpURLConnection 访问网络资源的更多相关文章

  1. 使用VC建立网络连接并访问网络资源

    目录 1. 提出问题 2. 解决方案 1. 提出问题 在windows下可以通过系统操作,将局域网的资源映射到本地,从而实现像本地数据一样访问网络资源.实际上这些步骤也可通过代码调用win32函数实现 ...

  2. java成神之——HttpURLConnection访问api

    HttpURLConnection 访问get资源 访问post资源 访问Delete资源 获取状态码 结语 HttpURLConnection 访问get资源 HttpURLConnection c ...

  3. 【CUDA 基础】4.3 内存访问模式

    title: [CUDA 基础]4.3 内存访问模式 categories: - CUDA - Freshman tags: - 内存访问模式 - 对齐 - 合并 - 缓存 - 结构体数组 - 数组结 ...

  4. crawler_基础之_httpclient 访问网络资源

    先粘贴一个 简单版的,后期再修改 pom文件 <dependency> <groupId>org.apache.httpcomponents</groupId> & ...

  5. 简单使用URLConnection、HttpURLConnection和HttpClient访问网络资源

    URL的openConnection方法将返回一个URLConnection,该对象表示应用程序和URL之间的通信连接.程序可以通过它的实例向该URL发送请求,读取URL引用的资源. 下面通过一个简单 ...

  6. Java多线程基础——对象及变量并发访问

    在开发多线程程序时,如果每个多线程处理的事情都不一样,每个线程都互不相关,这样开发的过程就非常轻松.但是很多时候,多线程程序是需要同时访问同一个对象,或者变量的.这样,一个对象同时被多个线程访问,会出 ...

  7. 关于安卓开发当中通过java自带的HttpURLConnection访问XML的java.io.EOFException问题

    刚接触安卓开发,试着写个小程序熟悉下,就写了天气预报的小程序,通过httpUrlConnection读流的方式来获取网络公共接口提供的天气XML信息.但在建立http连接时一直报java.io.EOF ...

  8. 通过HTTP访问网络资源

    添加访问网络的权限:<uses-permission android:name="android.permission.INTERNET"/> package com. ...

  9. Android网络:HTTP之利用HttpURLConnection访问网页、获取网络图片实例 (附源码)

    http://blog.csdn.net/yanzi1225627/article/details/22222735 如前文所示的TCP局域网传送东西,除了对传输层的TCP/UDP支持良好外,Andr ...

随机推荐

  1. 【C语言探索之旅】 第二部分第十课:练习题和习作

    内容简介 1.课程大纲 2.第二部分第十一课:  练习题和习作 3.第三部分第一课预告:  安装SDL 课程大纲 我们的课程分为四大部分,每一个部分结束后都会有练习题,并会公布答案.还会带大家用C语言 ...

  2. 《编程简介(Java) &#183;10.3递归思想》

    <编程简介(Java) ·10.3递归思想> 10.3.1 递归的概念 以两种方式的人:男人和女人:算法是两种:递归迭代/通知: 递归方法用自己的较简单的情形定义自己. 在数学和计算机科学 ...

  3. Android学习小Demo(19)利用Loader来实时接收短信

    之前写过一篇文章<Android学习小Demo(13)Android中关于ContentObserver的使用>,在里面利用ContentOberver去监測短信URI内容的变化.我们先来 ...

  4. swift 它们的定义TabBarItem

    1.效果图     2.NewsViewController.swift // // NewsViewController.swift // NavigationDemo // // Created ...

  5. WPF学习(8)数据绑定

    说到数据绑定,其实这并不是一个新的玩意儿.了解asp.net的朋友都知道,在asp.net中已经用到了这个概念,例如Repeater等的数据绑定.那么,在WPF中的数据绑定相比较传统的asp.net中 ...

  6. Python标准库简介

    在<Python语言参考手册>描述中的描述Python语法和语义,而本手冊主要介绍了Python标准库的内容和使用,也介绍了一些发行库里可选的组件库. Python标准库包括的内容是非常广 ...

  7. css3 3d旋转动画

    <!doctype html> <html> <head> <meta charset="utf-8"> <title> ...

  8. http报错之return error code:401 unauthorized

     http报错之return error code:401 unauthorized 依据HTTP返回码所表示的意思应该是未授权,没有输入账号和password,因此解决方法就直接在HTTP包里面 ...

  9. oracle connect by 说明

    Oracle能够通过START WITH . . . CONNECT BY . . .子句来实现SQL分层查询,这递归查询 例如: select level||'月' 月份 from dual con ...

  10. unity3d 依据指定的Assets下的目录路径 返回这个路径下的全部文件名称

    using UnityEngine; using System.Collections; using System.Collections.Generic; using System.IO; < ...