crawler_基础之_java.net.HttpURLConnection 访问网络资源

Java 访问网络资源的方式，由底层到高层封装依次为：Socket ==> java.net.HttpURLConnection ==> HttpClient

这次先阐述 java.net.HttpURLConnection 的方式，好处是不用额外导包，JDK 原生自带。

HtmlUtil 包含尝试重连（3次），编码识别，保存文件到磁盘

package com.cph.crawler.core.utils;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLEncoder;
import java.nio.charset.Charset;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

/**

 * 类说明：html有关的操作 <br>

 * --22下午08::20创建<br>

 *

 * @author cphmvp

 */

public final class HtmlUtil {

    public final static Log LOG = LogFactory.getLog(HtmlUtil.class);

    static String defaultEncoding = "utf-8";

    static HttpURLConnection httpURLConnection = null;

    static URL urlModel = null;

    // 链接超时时间

    static int connectTimeout = ;

    // 读取响应超时时间

    static int readTimeout = ;

    /**

     * 下载图片<br>

     *

     * @param url

     *            图片的下载地址<br>

     * @param savePath

     *            保存路径<br>

     * @throws IOException

     */

    @SuppressWarnings("resource")

    public static void downloadAndSavePictureToDisk(String url, String savePath)

            throws IOException {

        urlModel = new URL(url);

        httpURLConnection = (HttpURLConnection) urlModel.openConnection();

        httpURLConnection.setConnectTimeout(connectTimeout);

        httpURLConnection.setReadTimeout(readTimeout);

        httpURLConnection.setDoOutput(true);

        InputStream is = httpURLConnection.getInputStream();

        BufferedReader rd = new BufferedReader(new InputStreamReader(is));

        FileOutputStream fw = null;

        File f = new File(savePath.substring(, savePath.lastIndexOf("/")));

        if (!f.exists()) {

            f.mkdirs();

        }

        File eixtsFile = new File(savePath);

        if (eixtsFile.exists()) {

            return;

        }

        fw = new FileOutputStream(savePath, true);

        int num = -;

        while ((num = is.read()) != (-))// 是否读完所有数据

        {

            fw.write(num);// 将数据写往文件

        }

        rd.close();

        is.close();

        if (httpURLConnection != null) {

            httpURLConnection.disconnect();

        }

    }

    /**

     * 讲url后面的参数进行编码

     *

     * @param url

     * @return

     * @throws UnsupportedEncodingException

     */

    private static String encodParamters(String url)

            throws UnsupportedEncodingException {

        String returnStr = new String(url);

        String regex = "=([^&]+)";

        Pattern p = Pattern.compile(regex);

        Matcher m = p.matcher(url);

        while (m.find()) {

            String replaceStr = m.group();

            returnStr = returnStr.replaceFirst(replaceStr,

                    URLEncoder.encode(replaceStr, "utf-8"));

        }

        return returnStr;

    }

    /**

     * 获取会话的JSESSIONID

     *

     * @param url

     * @return

     */

    public static String getSession(String url) {

        String sessionId = "";

        try {

            urlModel = new URL(url);

            httpURLConnection = (HttpURLConnection) urlModel.openConnection();

            httpURLConnection.setConnectTimeout(connectTimeout);

            httpURLConnection.setReadTimeout(readTimeout);

            String cookieVal = null;

            String key = null;

            for (int i = ; (key = httpURLConnection.getHeaderFieldKey(i)) != null; i++) {

                if (key.equalsIgnoreCase("set-cookie")) {

                    cookieVal = httpURLConnection.getHeaderField(i);

                    cookieVal = cookieVal.substring(, cookieVal.indexOf(";"));

                    sessionId = sessionId + cookieVal + ";";

                }

            }

        } catch (MalformedURLException e) {

            LOG.error(e);

        } catch (IOException e) {

            LOG.error(e);

        }

        return sessionId;

    }

    /**

     * 下载页面</br>

     *

     * @param page

     *            </br>

     * @return 页面源码

     * @throws IOException

     * @throws UnsupportedEncodingException

     */

    public static StringBuffer downloadHtml(String url,String encoding) {

        StringBuffer sb = new StringBuffer();

        BufferedReader in = null;

        int tryNum = ;

        while (true) {

            try {

                if (tryNum > ) {

                    String ecodingUrl = encodParamters(url);

                    urlModel = new URL(ecodingUrl);

                } else {

                    urlModel = new URL(url);

                }

                httpURLConnection = (HttpURLConnection) urlModel

                        .openConnection();

                httpURLConnection.setConnectTimeout(connectTimeout);

                httpURLConnection.setReadTimeout(readTimeout);

                httpURLConnection

                        .setRequestProperty("User-Agent",

                                "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)");

                String redirectUrl = httpURLConnection.getURL().toString();

                if (!redirectUrl.equals(url)) {

                    LOG.info(url + "重定向后为" + redirectUrl);

                }

                String charSetHeader = httpURLConnection

                        .getHeaderField("Content-Type");

                String charSet = null;

                if (charSetHeader != null) {

                    Pattern p = Pattern.compile("charset=[\"']?(.*?)['\"]");

                    Matcher m = p.matcher(charSetHeader);

                    if (m.find())

                        charSet = m.group().trim();

                    if (null == charSet) {

                        charSet = encoding;

                    }

                }

                charSet = (charSet == null ? encoding : charSet);

                in = new BufferedReader(new InputStreamReader(

                        httpURLConnection.getInputStream(), charSet));

                String inputLine;

                while ((inputLine = in.readLine()) != null) {

                    sb.append(inputLine + "\n");

                    inputLine = null;

                }

                if (in != null)

                    try {

                        in.close();

                    } catch (IOException e) {

                        LOG.error(e);

                    }

                if (httpURLConnection != null)

                    httpURLConnection.disconnect();

                break;

            } catch (Exception e) {

                if (tryNum++ == ) {

                    LOG.error("download page error [ " + urlModel + " ] ");

                    return null;

                }

                LOG.warn(tryNum + "次下载失败", e);

            }

        }

        return sb;

    }

    /**

     * 下载页面</br>

     *

     * @param page

     *            </br>

     * @return 页面源码

     * @throws IOException

     * @throws UnsupportedEncodingException

     */

    public static StringBuffer downloadHtml(String url) {

        StringBuffer sb = new StringBuffer();

        BufferedReader in = null;

        int tryNum = ;

        while (true) {

            try {

                if (tryNum > ) {

                    String ecodingUrl = encodParamters(url);

                    urlModel = new URL(ecodingUrl);

                } else {

                    urlModel = new URL(url);

                }

                httpURLConnection = (HttpURLConnection) urlModel

                        .openConnection();

                httpURLConnection.setConnectTimeout(connectTimeout);

                httpURLConnection.setReadTimeout(readTimeout);

                httpURLConnection

                        .setRequestProperty("User-Agent",

                                "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)");

                String redirectUrl = httpURLConnection.getURL().toString();

                if (!redirectUrl.equals(url)) {

                    LOG.info(url + "重定向后为" + redirectUrl);

                }

                String charSetHeader = httpURLConnection

                        .getHeaderField("Content-Type");

                String charSet = null;

                if (charSetHeader != null) {

                    Pattern p = Pattern.compile("charset=[\"']?(.*?)['\"]");

                    Matcher m = p.matcher(charSetHeader);

                    if (m.find())

                        charSet = m.group().trim();

                    if (null == charSet) {

                        charSet = defaultEncoding;

                    }

                }

                charSet = (charSet == null ? defaultEncoding : charSet);

                in = new BufferedReader(new InputStreamReader(

                        httpURLConnection.getInputStream(), charSet));

                String inputLine;

                while ((inputLine = in.readLine()) != null) {

                    sb.append(inputLine + "\n");

                    inputLine = null;

                }

                if (in != null)

                    try {

                        in.close();

                    } catch (IOException e) {

                        LOG.error(e);

                    }

                if (httpURLConnection != null)

                    httpURLConnection.disconnect();

                break;

            } catch (Exception e) {

                if (tryNum++ == ) {

                    LOG.error("download page error [ " + urlModel + " ] ");

                    return null;

                }

                LOG.warn(tryNum + "次下载失败", e);

            }

        }

        return sb;

    }

}

crawler_基础之_java.net.HttpURLConnection 访问网络资源的更多相关文章

使用VC建立网络连接并访问网络资源
目录 1. 提出问题 2. 解决方案 1. 提出问题在windows下可以通过系统操作,将局域网的资源映射到本地,从而实现像本地数据一样访问网络资源.实际上这些步骤也可通过代码调用win32函数实现 ...
java成神之——HttpURLConnection访问api
HttpURLConnection 访问get资源访问post资源访问Delete资源获取状态码结语 HttpURLConnection 访问get资源 HttpURLConnection c ...
【CUDA 基础】4.3 内存访问模式
title: [CUDA 基础]4.3 内存访问模式 categories: - CUDA - Freshman tags: - 内存访问模式 - 对齐 - 合并 - 缓存 - 结构体数组 - 数组结 ...
crawler_基础之_httpclient 访问网络资源
先粘贴一个简单版的,后期再修改 pom文件 <dependency> <groupId>org.apache.httpcomponents</groupId> & ...
简单使用URLConnection、HttpURLConnection和HttpClient访问网络资源
URL的openConnection方法将返回一个URLConnection,该对象表示应用程序和URL之间的通信连接.程序可以通过它的实例向该URL发送请求,读取URL引用的资源. 下面通过一个简单 ...
Java多线程基础——对象及变量并发访问
在开发多线程程序时,如果每个多线程处理的事情都不一样,每个线程都互不相关,这样开发的过程就非常轻松.但是很多时候,多线程程序是需要同时访问同一个对象,或者变量的.这样,一个对象同时被多个线程访问,会出 ...
关于安卓开发当中通过java自带的HttpURLConnection访问XML的java.io.EOFException问题
刚接触安卓开发,试着写个小程序熟悉下,就写了天气预报的小程序,通过httpUrlConnection读流的方式来获取网络公共接口提供的天气XML信息.但在建立http连接时一直报java.io.EOF ...
通过HTTP访问网络资源
添加访问网络的权限:<uses-permission android:name="android.permission.INTERNET"/> package com. ...
Android网络:HTTP之利用HttpURLConnection访问网页、获取网络图片实例 (附源码)
http://blog.csdn.net/yanzi1225627/article/details/22222735 如前文所示的TCP局域网传送东西,除了对传输层的TCP/UDP支持良好外,Andr ...

随机推荐

Sonar安装与使用说明
我总结的Sonar安装与使用说明,需要的可以去网盘下载. 网盘地址: http://pan.baidu.com/s/199BII
Search Bars（一个）
A search bar provides an interface for text-based searches with a text box and buttons such as searc ...
cocos2dx环境配置和打包
安装软件准备就绪: vs2012 cocos2d-x-2.2.1 adt-bundle-windows-x86_64-20121030 android-ndk-r9c-windows-x86_64 j ...
【Android进阶】Android调用WebService的实现
最近想自己搞搞服务器,就从最简单的webservice开始吧先上效果图项目结构开始贴代码,注释都有,有问题的请留言 MainActivity.java package com.example.w ...
JList用法小结
JList用法小结分类: JAVA技术2007-08-11 01:02 18485人阅读评论(11) 收藏举报 stringvectorclassjavaactionobject ...
[站点部署_01]wordpress建站网页响应速度慢
最近可能非常多人发现站点打开速度变慢.这里分享一下该问题的定位方法. 我在本地部署了一个wordpress站点,近几天突然发现站点訪问速度奇慢,实在不能忍.于是採用例如以下方法攻克了这个问题: 1)使 ...
七牛对用户使用webp图片格式的使用建议
Qiniu 七牛问题解答 Chrome浏览器是可打开WebP格式的.可是并非全部的浏览器都支持webp格式,比如360.ie等浏览器是不支持的. WebP格式,谷歌(google)开发的一种旨在加快图 ...
BestCoder Round #16
BestCoder Round #16 题目链接这场挫掉了,3挂2,都是非常sb的错误 23333 QAQ A:每一个数字.左边个数乘上右边个数,就是能够组成的区间个数,然后乘的过程注意取模不然会爆 ...
[创意标题] spoj 11354 Amusing numbers
意甲冠军: 给k(1<=k<=10^15),先询问k 大只包含数字5和6的数目是多少实例 1那是,5 ,3那是,55 .4那是,56 思考: 首先,我们可以找到.有许多2这是头号,有两个 ...
SQL Server 2008性能故障排查（二）——CPU
原文:SQL Server 2008性能故障排查(二)--CPU 承接上一篇:SQL Server 2008性能故障排查(一)--概论说明一下,CSDN的博客编辑非常不人性化,我在word里面都排好 ...

crawler_基础之_java.net.HttpURLConnection 访问网络资源

crawler_基础之_java.net.HttpURLConnection 访问网络资源的更多相关文章

随机推荐

热门专题