httpClient爬虫

 package httpClient.client;

 import java.io.File;

 import java.io.IOException;

 import java.io.InputStream;

 import java.util.UUID;

 import org.apache.commons.io.FileUtils;

 import org.apache.http.HttpEntity;

 import org.apache.http.client.ClientProtocolException;

 import org.apache.http.client.methods.CloseableHttpResponse;

 import org.apache.http.client.methods.HttpGet;

 import org.apache.http.impl.client.CloseableHttpClient;

 import org.apache.http.impl.client.HttpClients;

 import org.apache.http.util.EntityUtils;

 import org.jsoup.Jsoup;

 import org.jsoup.nodes.Document;

 import org.jsoup.nodes.Element;

 import org.jsoup.select.Elements;

 /**
  * Minimal image crawler built on Apache HttpClient and jsoup.
  *
  * <p>It downloads the listing page, follows every post link found below the element
  * with id {@code pins}, and stores the image referenced inside each post's
  * {@code main-image} element in the current working directory under a random name.
  */
 public class HttpClinet {

     /** Listing page whose post links are crawled. */
     private static final String START_URL = "https://www.mzitu.com/";

     /** Charset used to decode the fetched HTML pages. */
     private static final String PAGE_CHARSET = "UTF-8";

     /**
      * Runs the crawl once and closes the HTTP client afterwards, even when a request fails.
      *
      * @param args unused
      * @throws ClientProtocolException on an HTTP protocol error
      * @throws IOException            on a connection or file-system error
      */
     public static void main(String[] args) throws ClientProtocolException, IOException {
         HttpClinet t = new HttpClinet();
         CloseableHttpClient httpClient = HttpClients.createDefault();
         try {
             String html = t.fetchHtml(httpClient, START_URL);
             if (html == null) {
                 System.err.println("No content returned for " + START_URL);
                 return;
             }

             Document document = Jsoup.parse(html);
             // getElementById returns null when the page has no such element.
             Element postList = document.getElementById("pins");
             if (postList == null) {
                 System.err.println("Element with id 'pins' not found on " + START_URL);
                 return;
             }

             // Every anchor inside the list items points to a post page.
             Elements postItems = postList.select("li a");

             // Consecutive anchors may carry the same href; remember the last one to skip repeats.
             String previousUrl = "";
             for (Element postItem : postItems) {
                 String postUrl = postItem.attr("href").trim();
                 if (postUrl.isEmpty() || postUrl.equals(previousUrl)) {
                     continue;
                 }
                 previousUrl = postUrl;

                 String postHtml = t.fetchHtml(httpClient, postUrl);
                 if (postHtml == null) {
                     continue;
                 }

                 Document postDocument = Jsoup.parse(postHtml);
                 // first() returns null when no element has the class.
                 Element mainImage = postDocument.getElementsByClass("main-image").first();
                 if (mainImage == null) {
                     continue;
                 }

                 // Elements.attr yields "" (never null) when nothing matches, so test for emptiness.
                 String imageUrl = mainImage.select("p a img").attr("src");
                 if (imageUrl.isEmpty()) {
                     continue;
                 }

                 System.out.println(imageUrl);
                 t.save(imageUrl, httpClient);
             }
         } finally {
             t.close(httpClient);
         }
     }

     /**
      * Downloads {@code url} and writes the response body to a new file in the working
      * directory. The file name is a random UUID followed by the extension taken from the URL.
      * Nothing is written when the response carries no entity.
      *
      * @param url        address of the resource to download
      * @param httpClient client used to perform the request
      * @throws ClientProtocolException on an HTTP protocol error
      * @throws IOException            on a connection or file-system error
      */
     public void save(String url, CloseableHttpClient httpClient) throws ClientProtocolException, IOException {
         HttpEntity entity = this.getEntity(httpClient, url);
         if (entity == null) {
             return;
         }
         try {
             // The Content-Type header is optional, so guard before reading its value.
             if (entity.getContentType() != null) {
                 System.out.println("Content-Type:" + entity.getContentType().getValue());
             }

             InputStream inputStream = entity.getContent();
             if (inputStream != null) {
                 // FileUtils comes from commons-io and must be on the classpath.
                 FileUtils.copyToFile(inputStream, new File(UUID.randomUUID() + extensionOf(url)));
             }
         } finally {
             // Always close the content stream so the connection is released, even if the copy failed.
             EntityUtils.consumeQuietly(entity);
         }
     }

     /**
      * Closes the given client; a {@code null} client is ignored.
      *
      * @param httpClient client to close, may be {@code null}
      * @throws IOException if closing fails
      */
     public void close(CloseableHttpClient httpClient) throws IOException {
         if (httpClient != null) {
             httpClient.close();
         }
     }

     /**
      * Executes a GET request for {@code url} with fixed If-None-Match, Referer and User-Agent
      * headers and returns the response entity.
      *
      * <p>The caller must fully consume the returned entity (or close its content stream);
      * the response itself is only closed here when it carries no entity.
      *
      * @param httpClient client used to perform the request
      * @param url        address to request
      * @return the response entity, or {@code null} when the response has none
      * @throws ClientProtocolException on an HTTP protocol error
      * @throws IOException            on a connection error
      */
     public HttpEntity getEntity(CloseableHttpClient httpClient, String url) throws ClientProtocolException, IOException {
         HttpGet httpGet = new HttpGet(url);
         // NOTE(review): this ETag literal has no closing quote and is sent with every request;
         // confirm that it is intentional.
         httpGet.setHeader("If-None-Match", "W/\"5cc2cd8f-2c58");
         httpGet.setHeader("Referer", "http://www.mzitu.com/all/");
         httpGet.setHeader("User-Agent",
                 "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36");

         CloseableHttpResponse response = httpClient.execute(httpGet);
         HttpEntity entity = response.getEntity();
         if (entity == null) {
             // Nothing for the caller to consume, so release the response right away.
             response.close();
         }
         return entity;
     }

     /** Fetches {@code url} and decodes the body as text; returns null when there is no body. */
     private String fetchHtml(CloseableHttpClient httpClient, String url) throws IOException {
         HttpEntity entity = getEntity(httpClient, url);
         if (entity == null) {
             return null;
         }
         // EntityUtils.toString reads the whole stream and closes it.
         return EntityUtils.toString(entity, PAGE_CHARSET);
     }

     /** Returns the extension (with leading dot) of the last path segment of {@code url}, or "". */
     private static String extensionOf(String url) {
         String path = url;
         int cut = path.indexOf('?');
         if (cut >= 0) {
             path = path.substring(0, cut);
         }
         cut = path.indexOf('#');
         if (cut >= 0) {
             path = path.substring(0, cut);
         }
         int slash = path.lastIndexOf('/');
         int dot = path.lastIndexOf('.');
         // Only a dot inside the last segment, followed by at least one character, is an extension.
         if (dot > slash && dot < path.length() - 1) {
             return path.substring(dot);
         }
         return "";
     }

 }

pom.xml

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"

  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">

  <!-- Maven build descriptor for the HttpClient crawler example. -->

  <modelVersion>4.0.0</modelVersion>

  <!-- groupId + artifactId mirror the Java package httpClient.client used by the crawler class. -->

  <groupId>httpClient</groupId>

  <artifactId>client</artifactId>

  <version>0.0.1-SNAPSHOT</version>

  <packaging>jar</packaging>

  <name>client</name>

  <url>http://maven.apache.org</url>

  <properties>

    <!-- Source files are read as UTF-8. -->

    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>

  </properties>

  <dependencies>

    <!-- JUnit, test scope only. -->

    <dependency>

      <groupId>junit</groupId>

      <artifactId>junit</artifactId>

      <version>3.8.1</version>

      <scope>test</scope>

    </dependency>

    <!-- Apache HttpClient: provides CloseableHttpClient, HttpGet and EntityUtils used for the HTTP requests. -->

    <dependency>

    <groupId>org.apache.httpcomponents</groupId>

    <artifactId>httpclient</artifactId>

    <version>4.5.8</version>

</dependency>

<!-- jsoup: provides Jsoup, Document, Element and Elements used to parse the fetched HTML. -->

<dependency>

    <groupId>org.jsoup</groupId>

    <artifactId>jsoup</artifactId>

    <version>1.12.1</version>

</dependency>

  <!-- Commons IO: provides FileUtils.copyToFile used to write the downloaded stream to disk. -->

  <dependency>

        <groupId>commons-io</groupId>

        <artifactId>commons-io</artifactId>

        <version>2.5</version>

    </dependency>

  </dependencies>

</project>

httpClient爬虫的更多相关文章

pdf.js跨域加载文件
pdf.js一个基于Html的工具类,熟悉pdf.js的朋友们很清楚,pdf.js帮助我们做了很多事.尤其金融类网站会产生很多的报表.需要在线预览.pdf.js绝对是我们的首选本地预览在pdf.j ...
[Java]使用HttpClient实现一个简单爬虫，抓取煎蛋妹子图
第一篇文章,就从一个简单爬虫开始吧. 这只虫子的功能很简单,抓取到”煎蛋网xxoo”网页(http://jandan.net/ooxx/page-1537),解析出其中的妹子图,保存至本地. 先放结果 ...
使用 HttpClient 和 HtmlParser 实现简易爬虫
这篇文章介绍了 HtmlParser 开源包和 HttpClient 开源包的使用,在此基础上实现了一个简易的网络爬虫 (Crawler),来说明如何使用 HtmlParser 根据需要处理 Inte ...
HtmlParser + HttpClient 实现爬虫
简易爬虫的实现 HttpClient 提供了便利的 HTTP 协议访问,使得我们可以很容易的得到某个网页的源码并保存在本地:HtmlParser 提供了如此简便灵巧的类库,可以从网页中便捷的提取出指向 ...
[转]使用 HttpClient 和 HtmlParser 实现简易爬虫
http://www.ibm.com/developerworks/cn/opensource/os-cn-crawler/ http://blog.csdn.net/dancen/article/d ...
HttpClient的使用-爬虫学习1
HttpClient的使用-爬虫学习(一) Apache真是伟大,为我们提供了HttpClient.jar,这个HttpClient是客户端的http通信实现库,这个类库的作用是接受和发送http报文 ...
HttpClient和 HtmlParser实现爬虫
网络爬虫技术 1 什么叫网络爬虫网络爬虫(又被称为网页蜘蛛,网络机器人,在FOAF社区中间,更经常的称为网页追逐者),是一种按照一定的规则,自动地抓取万维网信息的程序或者脚本.另外一些不 ...
用HttpClient和用HttpURLConnection做爬虫发现爬取的代码少了的问题
最近在学习用java来做爬虫,但是发现不管用哪种方式,爬取到的代码都比网页的源码少了很多.在网上查了很多资料,都说是inputStream的缓冲区太小而爬取的网页太大,导致读取出来的网页代码不完整,但是后面发现并不 ...
使用HttpClient和Jsoup实现一个简单爬虫
一直很想了解一下爬虫这个东西的,完全是出于兴趣,其实刚开始是准备用python的,但是由于种种原因选择了java,此处省略很多字... 总之,如果你想做一件事情的话就尽快去做吧,千万不要把战线拉得太长 ...

随机推荐

spring-redis-session 自定义 key 和过期时间
对于分布式应用来说,最开始遇到的问题就是 session 的存储了,解决方案大致有如下几种使用 spring-session 它可以把 session 存储到你想存储的位置,如 redis,mysq ...
vmware安装ubuntu的简单配置
介绍:ubuntu是一个桌面体验比较好的linux操作系统,尝试使用vmware安装一个虚拟机试用一下,做个简单记录,安装操作系统步骤省略. 一.配置root用户,并使用root登录图形界面 Ubunt ...
react 报红错误汇总
react 报红错误汇总一.Uncaught TypeError: Cannot read property 'value' of undefined 未知类型错:无法读取未定义的属性“value ...
windows生成github密钥并推送文件踩坑
强调官方文档最可靠,百度踩坑很浪费时间,建议去寻找一手数据源头 github官方文档提供了帮助第一步查看密钥如果您还没有 SSH 密钥,则必须生成新 SSH 密钥. 如果您不确定是否已有 SSH ...
使用PE启动盘清空电脑登入密码
1.PE启动盘制作过程要制作一个启动盘可以使用很多工具来制作,比如老毛桃.U深度.大白菜等软件都可以制作PE启动盘.此处就用老毛桃制作PE启动盘为例(http://www.laomaotao.tv/ ...
Have Fun with Numbers
Notice that the number 123456789 is a 9-digit number consisting exactly the numbers from 1 to 9, wit ...
使用java做一个能赚钱的微信群聊机器人（2020年基于PC端协议最新可用版）
前言微信群机器人,主要用来管理群聊,提供类似天气查询.点歌.机器人聊天等用途. 由于微信将web端的协议封杀后,很多基于http协议的群聊机器人都失效了,所以这里使用基于PC端协议的插件来实现. 声 ...
shell学习-常用语句
为什么使用shell 可以快速.简单的完成编程,实现自己的想法.Shell非常适合编写小的工具,因为小工具更强调的是易于配置.维护.移植等,而不是执行效率. 当自己的想法确实有必要进行优化,有必要让它 ...
PQSQL 按照时间进行分组
按照时间分组时一般是按照年.月.日进行分组,不会把时分秒也算进去,所以需要把时间戳提取出所需要的时间段,本质上是把时间戳格式化成对应形式的字符串,这个过程需要用to_char(timestamp, t ...
rest实践3
1.从mongodb的数据实体Document中获取其中一个字段的值,即例如:doc.getString("pid"),直接显示value. 2.当从网络上的网址url的图片直接弄 ...

httpClient爬虫

httpClient爬虫的更多相关文章

随机推荐

热门专题