Java Web Crawler: HttpClient
Overview: HttpClient started as a subproject under Apache Jakarta Commons and is now part of Apache HttpComponents. It is an efficient, feature-rich client-side toolkit for programming against the HTTP protocol. Its main capabilities are:
- Implements all HTTP methods: GET, POST, PUT, HEAD, etc.
- Supports automatic redirects
- Supports the HTTPS protocol
- Supports proxy servers
For an explanation of the HTTP request methods, see this blog post:
https://www.cnblogs.com/williamjie/p/9099940.html
1. Environment Setup
- JDK 1.8
- IntelliJ IDEA
- IDEA's bundled Maven
Create a Maven project named itcast-crawler-first and add the following dependencies to pom.xml:
<dependencies>
    <!-- HttpClient -->
    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpclient</artifactId>
        <version>4.5.3</version>
    </dependency>
    <!-- Logging -->
    <dependency>
        <groupId>org.slf4j</groupId>
        <artifactId>slf4j-log4j12</artifactId>
        <version>1.7.25</version>
    </dependency>
</dependencies>
The logging configuration file (typically log4j.properties):
log4j.rootLogger=DEBUG,A1
log4j.logger.cn.itcast=DEBUG
log4j.appender.A1=org.apache.log4j.ConsoleAppender
log4j.appender.A1.layout=org.apache.log4j.PatternLayout
log4j.appender.A1.layout.ConversionPattern=%-d{yyyy-MM-dd HH:mm:ss,SSS} [%t] [%c]-[%p] %m%n
log4j can write logs to a file or print them to the console, and it lets you control the output format as well as how log files are generated (append vs. overwrite, maximum file size, and so on). Here the logs are simply printed to the console via org.apache.log4j.ConsoleAppender.
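If you do want file output, a rolling file appender could be added alongside A1. This is only a sketch: the appender name A2, the file name crawler.log, and the size limits are illustrative values, not part of the original setup.
# Sketch: also write the log to a rolling file.
# Remember to register A2 on the root logger, e.g. log4j.rootLogger=DEBUG,A1,A2
log4j.appender.A2=org.apache.log4j.RollingFileAppender
log4j.appender.A2.File=crawler.log
log4j.appender.A2.MaxFileSize=10MB
log4j.appender.A2.MaxBackupIndex=5
log4j.appender.A2.layout=org.apache.log4j.PatternLayout
log4j.appender.A2.layout.ConversionPattern=%-d{yyyy-MM-dd HH:mm:ss,SSS} [%t] [%c]-[%p] %m%n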
2. Writing the Code
Let's write the simplest possible crawler and fetch the 传智播客 (itcast) homepage: http://www.itcast.cn/
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

public class CrawlerFirst {
    public static void main(String[] args) throws Exception {
        // 1. "Open the browser": create the HttpClient object
        CloseableHttpClient httpClient = HttpClients.createDefault();
        // 2. "Type in the address": create an HttpGet object for the GET request
        HttpGet httpGet = new HttpGet("http://www.itcast.cn");
        // 3. "Press Enter": send the request with HttpClient and get the response
        CloseableHttpResponse response = httpClient.execute(httpGet);
        // 4. Parse the response and extract the data
        // Check whether the status code is 200
        if (response.getStatusLine().getStatusCode() == 200) {
            HttpEntity httpEntity = response.getEntity();
            String content = EntityUtils.toString(httpEntity, "utf8");
            System.out.println(content);
        }
    }
}
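Note that this first example never releases the response or the client. As a minimal sketch (the class name CrawlerFirstClosing is made up here), the same fetch can be written with try-with-resources, since both CloseableHttpClient and CloseableHttpResponse implement Closeable:

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

public class CrawlerFirstClosing {
    public static void main(String[] args) throws Exception {
        // Both resources are closed automatically when the try block exits
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(new HttpGet("http://www.itcast.cn"))) {
            if (response.getStatusLine().getStatusCode() == 200) {
                System.out.println(EntityUtils.toString(response.getEntity(), "utf8"));
            }
        }
    }
}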
3. GET Request
package cn.itcast.crawler.test;

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.io.IOException;

public class HttpGetTest {
    public static void main(String[] args) {
        // Create the HttpClient object
        CloseableHttpClient httpClient = HttpClients.createDefault();
        // Create the HttpGet object and set the URL to request
        HttpGet httpGet = new HttpGet("http://www.itcast.cn");
        CloseableHttpResponse response = null;
        try {
            // Send the request with HttpClient and get the response
            response = httpClient.execute(httpGet);
            // Parse the response
            if (response.getStatusLine().getStatusCode() == 200) {
                String content = EntityUtils.toString(response.getEntity(), "utf8");
                System.out.println(content.length());
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Close the response
            if (response != null) {
                try {
                    response.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            // Close the HttpClient
            try {
                httpClient.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
4. GET Request with Parameters
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.io.IOException;

public class HttpGetParamTest {
    public static void main(String[] args) throws Exception {
        // Create the HttpClient object
        CloseableHttpClient httpClient = HttpClients.createDefault();
        // The target URL is: http://yun.itheima.com/search?keys=Java
        // Create a URIBuilder
        URIBuilder uriBuilder = new URIBuilder("http://yun.itheima.com/search");
        // Set the query parameter
        uriBuilder.setParameter("keys", "Java");
        // Create the HttpGet object with the built URI
        HttpGet httpGet = new HttpGet(uriBuilder.build());
        System.out.println("Request info: " + httpGet);
        CloseableHttpResponse response = null;
        try {
            // Send the request with HttpClient and get the response
            response = httpClient.execute(httpGet);
            // Parse the response
            if (response.getStatusLine().getStatusCode() == 200) {
                String content = EntityUtils.toString(response.getEntity(), "utf8");
                System.out.println(content.length());
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Close the response
            if (response != null) {
                try {
                    response.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            try {
                httpClient.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
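URIBuilder can carry more than one query parameter, and setParameter returns the builder, so calls can be chained. A brief sketch (the second parameter name "type" and its value are made-up illustrations, not parameters of the real site):

URIBuilder uriBuilder = new URIBuilder("http://yun.itheima.com/search");
uriBuilder.setParameter("keys", "Java")
          .setParameter("type", "video"); // hypothetical second parameter, for illustration only
HttpGet httpGet = new HttpGet(uriBuilder.build());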
5. POST Request
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.io.IOException;

public class HttpPostTest {
    public static void main(String[] args) {
        // Create the HttpClient object
        CloseableHttpClient httpClient = HttpClients.createDefault();
        // Create the HttpPost object and set the URL to request
        HttpPost httpPost = new HttpPost("http://www.itcast.cn");
        CloseableHttpResponse response = null;
        try {
            // Send the request with HttpClient and get the response
            response = httpClient.execute(httpPost);
            // Parse the response
            if (response.getStatusLine().getStatusCode() == 200) {
                String content = EntityUtils.toString(response.getEntity(), "utf8");
                System.out.println(content.length());
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Close the response
            if (response != null) {
                try {
                    response.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            try {
                httpClient.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
6. POST Request with Parameters
import org.apache.http.NameValuePair;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

public class HttpPostParamTest {
    public static void main(String[] args) throws Exception {
        // Create the HttpClient object
        CloseableHttpClient httpClient = HttpClients.createDefault();
        // Create the HttpPost object and set the URL to request
        HttpPost httpPost = new HttpPost("http://yun.itheima.com/search");
        // Declare a List to hold the form parameters
        List<NameValuePair> params = new ArrayList<NameValuePair>();
        // Equivalent to requesting: http://yun.itheima.com/search?keys=Java
        params.add(new BasicNameValuePair("keys", "Java"));
        // Create the form Entity: the first argument is the form data, the second is the encoding
        UrlEncodedFormEntity formEntity = new UrlEncodedFormEntity(params, "utf8");
        // Attach the form Entity to the POST request
        httpPost.setEntity(formEntity);
        CloseableHttpResponse response = null;
        try {
            // Send the request with HttpClient and get the response
            response = httpClient.execute(httpPost);
            // Parse the response
            if (response.getStatusLine().getStatusCode() == 200) {
                String content = EntityUtils.toString(response.getEntity(), "utf8");
                System.out.println(content.length());
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Close the response
            if (response != null) {
                try {
                    response.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            try {
                httpClient.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
7. HTTP Connection Pool
Creating a new HttpClient for every request means constantly constructing and tearing down clients and connections; a connection pool solves this.
Run the code below and step through it with a breakpoint: each call gets a different HttpClient object, while the underlying connections are managed and reused by the pool.
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;

import java.io.IOException;

public class HttpClientPoolTest {
    public static void main(String[] args) {
        // Create the pooling connection manager
        PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();
        // Set the maximum total number of connections
        cm.setMaxTotal(100);
        // Set the default maximum number of connections per host (route)
        cm.setDefaultMaxPerRoute(10);
        // Send requests through the connection manager
        doGet(cm);
        doGet(cm);
    }

    private static void doGet(PoolingHttpClientConnectionManager cm) {
        // Instead of creating a standalone client each time, obtain an HttpClient backed by the pool
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build();
        HttpGet httpGet = new HttpGet("http://www.itcast.cn");
        CloseableHttpResponse response = null;
        try {
            response = httpClient.execute(httpGet);
            if (response.getStatusLine().getStatusCode() == 200) {
                String content = EntityUtils.toString(response.getEntity(), "utf8");
                System.out.println(content.length());
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (response != null) {
                try {
                    response.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
                // Do NOT close the HttpClient here: its connections are managed by the pool
                // httpClient.close();
            }
        }
    }
}
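Beyond the two defaults set in main, the pool manager also lets you raise the limit for one specific route. A small sketch continuing from the cm created above (the host and the limit of 20 are illustrative values):

// Allow up to 20 connections to this particular host, while other hosts keep the default of 10
// (HttpRoute is org.apache.http.conn.routing.HttpRoute, HttpHost is org.apache.http.HttpHost)
cm.setMaxPerRoute(new HttpRoute(new HttpHost("www.itcast.cn", 80)), 20);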
8. Configuring Connection Settings with RequestConfig
When building a crawler you often need to configure quite a few settings: ConnectionRequestTimeout (how long to wait for a connection from the pool), ConnectTimeout (how long to wait while establishing the connection), SocketTimeout (how long to wait for data), the proxy, whether redirects are allowed, and so on.
In HttpClient these settings are made through the RequestConfig class and its inner Builder class.
The Builder source code is shown below (it is fairly long):
public static class Builder {
private boolean expectContinueEnabled;
private HttpHost proxy;
private InetAddress localAddress;
private boolean staleConnectionCheckEnabled;
private String cookieSpec;
private boolean redirectsEnabled;
private boolean relativeRedirectsAllowed;
private boolean circularRedirectsAllowed;
private int maxRedirects;
private boolean authenticationEnabled;
private Collection<String> targetPreferredAuthSchemes;
private Collection<String> proxyPreferredAuthSchemes;
private int connectionRequestTimeout;
private int connectTimeout;
private int socketTimeout;
private boolean contentCompressionEnabled;
Builder() {
super();
this.staleConnectionCheckEnabled = false;
this.redirectsEnabled = true;
this.maxRedirects = 50;
this.relativeRedirectsAllowed = true;
this.authenticationEnabled = true;
this.connectionRequestTimeout = -1;
this.connectTimeout = -1;
this.socketTimeout = -1;
this.contentCompressionEnabled = true;
}
public Builder setExpectContinueEnabled(final boolean expectContinueEnabled) {
this.expectContinueEnabled = expectContinueEnabled;
return this;
}
public Builder setProxy(final HttpHost proxy) {
this.proxy = proxy;
return this;
}
public Builder setLocalAddress(final InetAddress localAddress) {
this.localAddress = localAddress;
return this;
}
/**
* @deprecated (4.4) Use {@link
* org.apache.http.impl.conn.PoolingHttpClientConnectionManager#setValidateAfterInactivity(int)}
*/
@Deprecated
public Builder setStaleConnectionCheckEnabled(final boolean staleConnectionCheckEnabled) {
this.staleConnectionCheckEnabled = staleConnectionCheckEnabled;
return this;
}
public Builder setCookieSpec(final String cookieSpec) {
this.cookieSpec = cookieSpec;
return this;
}
public Builder setRedirectsEnabled(final boolean redirectsEnabled) {
this.redirectsEnabled = redirectsEnabled;
return this;
}
public Builder setRelativeRedirectsAllowed(final boolean relativeRedirectsAllowed) {
this.relativeRedirectsAllowed = relativeRedirectsAllowed;
return this;
}
public Builder setCircularRedirectsAllowed(final boolean circularRedirectsAllowed) {
this.circularRedirectsAllowed = circularRedirectsAllowed;
return this;
}
public Builder setMaxRedirects(final int maxRedirects) {
this.maxRedirects = maxRedirects;
return this;
}
public Builder setAuthenticationEnabled(final boolean authenticationEnabled) {
this.authenticationEnabled = authenticationEnabled;
return this;
}
public Builder setTargetPreferredAuthSchemes(final Collection<String> targetPreferredAuthSchemes) {
this.targetPreferredAuthSchemes = targetPreferredAuthSchemes;
return this;
}
public Builder setProxyPreferredAuthSchemes(final Collection<String> proxyPreferredAuthSchemes) {
this.proxyPreferredAuthSchemes = proxyPreferredAuthSchemes;
return this;
}
public Builder setConnectionRequestTimeout(final int connectionRequestTimeout) {
this.connectionRequestTimeout = connectionRequestTimeout;
return this;
}
public Builder setConnectTimeout(final int connectTimeout) {
this.connectTimeout = connectTimeout;
return this;
}
public Builder setSocketTimeout(final int socketTimeout) {
this.socketTimeout = socketTimeout;
return this;
}
/**
* @deprecated (4.5) Set {@link #setContentCompressionEnabled(boolean)} to {@code false} and
* add the {@code Accept-Encoding} request header.
*/
@Deprecated
public Builder setDecompressionEnabled(final boolean decompressionEnabled) {
this.contentCompressionEnabled = decompressionEnabled;
return this;
}
public Builder setContentCompressionEnabled(final boolean contentCompressionEnabled) {
this.contentCompressionEnabled = contentCompressionEnabled;
return this;
}
public RequestConfig build() {
return new RequestConfig(
expectContinueEnabled,
proxy,
localAddress,
staleConnectionCheckEnabled,
cookieSpec,
redirectsEnabled,
relativeRedirectsAllowed,
circularRedirectsAllowed,
maxRedirects,
authenticationEnabled,
targetPreferredAuthSchemes,
proxyPreferredAuthSchemes,
connectionRequestTimeout,
connectTimeout,
socketTimeout,
contentCompressionEnabled);
}
}
Timeout settings
Three timeouts can be set in HttpClient: ConnectionRequestTimeout (how long to wait for a connection from the pool), ConnectTimeout (how long to wait while establishing the connection), and SocketTimeout (how long to wait for data). A sample configuration using RequestConfig:
// Set all three timeouts to 10 seconds
RequestConfig requestConfig = RequestConfig.custom()
        .setSocketTimeout(10000)
        .setConnectTimeout(10000)
        .setConnectionRequestTimeout(10000)
        .build();
// Apply the config to the HttpClient
HttpClient httpClient = HttpClients.custom()
        .setDefaultRequestConfig(requestConfig)
        .build();
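A RequestConfig does not have to be set globally; it can also be attached to a single request, overriding the client's default for that request only. A short sketch:

// Apply the timeouts above to just one request instead of the whole client
HttpGet httpGet = new HttpGet("http://www.itcast.cn");
httpGet.setConfig(requestConfig);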
Proxy settings
// Add a proxy
RequestConfig defaultRequestConfig = RequestConfig.custom()
        .setProxy(new HttpHost("171.97.67.160", 3128, null))
        .build();
// Apply the config to the HttpClient
HttpClient httpClient = HttpClients.custom()
        .setDefaultRequestConfig(defaultRequestConfig)
        .build();
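RequestConfig also covers the redirect behaviour mentioned at the start of this section. A hedged sketch (the limit of 5 is an arbitrary example value):

// Follow redirects automatically, but give up after 5 hops
RequestConfig redirectConfig = RequestConfig.custom()
        .setRedirectsEnabled(true)
        .setMaxRedirects(5)
        .build();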
上一篇文章中,我们谈到了 InheritableThreadLocal,它解决了 ThreadLocal 针对父子线程无法共享上下文的问题.但我们可能听说过阿里的开源产品TransmittableTh ...