使用Jsoup 爬取网易首页所有的图片

package com.enation.newtest;

import java.io.File;

import java.io.FileNotFoundException;

import java.io.FileOutputStream;

import java.io.IOException;

import java.io.InputStream;

import java.net.HttpURLConnection;

import java.net.URL;

import java.net.URLConnection;

import java.util.ArrayList;

import java.util.List;

import java.util.regex.Matcher;

import java.util.regex.Pattern;

import org.apache.commons.lang3.StringEscapeUtils;

import org.jsoup.Jsoup;

import org.jsoup.nodes.Document;

/**
 * Crawls the NetEase (163.com) home page and saves every image it references.
 *
 * <p>The page is fetched with Jsoup, {@code <img>} tags are located in the serialized
 * HTML with regular expressions, and each image is written to
 * {@code <downloadPath>/<keyword>/}. Zero-length files left under the download
 * directory are removed at the end of the run.
 */
public class Jsoup163 {

    /** Page whose images are downloaded. */
    private static final String PAGE_URL = "http://www.163.com/";

    /** User-Agent header sent with the page request. */
    private static final String USER_AGENT =
            "Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)";

    /** Timeout, in milliseconds, for the page request and for each image request. */
    private static final int TIMEOUT_MILLIS = 5000;

    /** Size of the copy buffer used while saving an image. */
    private static final int BUFFER_SIZE = 8192;

    /** Matches one complete {@code <img ...>} tag; never spans more than a single tag. */
    private static final Pattern IMG_TAG =
            Pattern.compile("<img\\b[^>]*>", Pattern.CASE_INSENSITIVE);

    /** Captures the value of a {@code data-src} attribute inside a tag. */
    private static final Pattern DATA_SRC_ATTR =
            Pattern.compile("\\sdata-src\\s*=\\s*[\"']?([^\"'\\s>]+)", Pattern.CASE_INSENSITIVE);

    /**
     * Captures the value of a {@code src} attribute inside a tag. The leading whitespace
     * keeps it from matching the tail of attributes such as {@code data-src}.
     */
    private static final Pattern SRC_ATTR =
            Pattern.compile("\\ssrc\\s*=\\s*[\"']?([^\"'\\s>]+)", Pattern.CASE_INSENSITIVE);

    /**
     * Entry point: downloads the home-page images into a fixed local directory.
     *
     * @param args unused
     * @throws Exception propagated from {@link #getPictures(List, int, String)}
     */
    public static void main(String[] args) throws Exception {
        String downloadPath = "D:\\360Downloads\\test";
        List<String> list = nameList("网易--首页");
        // 1 = a single pass over the page.
        getPictures(list, 1, downloadPath);
    }

    /**
     * Downloads all images of the home page once per keyword and per pass.
     *
     * <p>For every keyword a sub-directory named after it is created below
     * {@code downloadPath}. After all keywords are processed, empty files below
     * {@code downloadPath} are deleted.
     *
     * @param keywordList  names of the sub-directories to fill
     * @param max          number of passes; every pass requests the same page URL
     * @param downloadPath root directory for the downloaded images
     * @throws Exception declared for backward compatibility; I/O errors while fetching the
     *                   page are printed and do not abort the run. A {@link RuntimeException}
     *                   is thrown by {@link #delMultyFile(String)} when {@code downloadPath}
     *                   does not exist at the end of the run.
     */
    public static void getPictures(List<String> keywordList, int max, String downloadPath)
            throws Exception {
        for (String keyword : keywordList) {
            // File(parent, child) inserts the platform separator, so this is not Windows-only.
            File targetDir = new File(downloadPath, keyword);
            if (!targetDir.exists()) {
                targetDir.mkdirs();
            }

            int picCount = 1;
            for (int page = 1; page <= max; page++) {
                sop("正在下载第" + page + "页面");
                try {
                    sop(PAGE_URL);
                    Document document = Jsoup.connect(PAGE_URL)
                            .data("query", "Java") // query parameter sent with the GET request
                            .userAgent(USER_AGENT)
                            .timeout(TIMEOUT_MILLIS)
                            .get();

                    for (String imageUrl : extractImageUrls(document.toString(), PAGE_URL)) {
                        sop(keyword + picCount++ + ":" + imageUrl);
                        // Report success only when the image was actually written.
                        if (saveImage(imageUrl, targetDir)) {
                            sop("             下载成功");
                        }
                    }
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }

        sop("下载完毕");
        delMultyFile(downloadPath);
        sop("已经删除所有空图");
    }

    /**
     * Recursively deletes every zero-length file below {@code path} and prints the result
     * of each deletion.
     *
     * @param path directory (or file) to clean up
     * @throws RuntimeException if {@code path} does not exist
     */
    public static void delMultyFile(String path) {
        File root = new File(path);
        if (!root.exists()) {
            throw new RuntimeException("File \"" + path
                    + "\" NotFound when excute the method of delMultyFile()....");
        }

        File[] children = root.listFiles();
        if (children == null) {
            // listFiles() returns null for a plain file or when the directory cannot be read.
            return;
        }
        for (File child : children) {
            if (child.isDirectory()) {
                delMultyFile(child.getAbsolutePath());
            } else if (child.length() == 0) {
                sop(child.delete() + "---" + child.getName());
            }
        }
    }

    /**
     * Splits a list of names on the first separator kind found in it: {@code ","}, then
     * {@code "、"}, then a single space. A string without any separator yields a
     * one-element list.
     *
     * @param nameList names joined by one of the supported separators
     * @return a new mutable list with the individual names
     */
    public static List<String> nameList(String nameList) {
        List<String> names = new ArrayList<String>();

        String separator = null;
        if (nameList.contains(",")) {
            separator = ",";
        } else if (nameList.contains("、")) {
            separator = "、";
        } else if (nameList.contains(" ")) {
            separator = " ";
        }

        if (separator == null) {
            names.add(nameList);
            return names;
        }
        for (String name : nameList.split(separator)) {
            names.add(name);
        }
        return names;
    }

    /**
     * Prints {@code obj} followed by a line break to standard output.
     *
     * @param obj value to print
     */
    public static void sop(Object obj) {
        System.out.println(obj);
    }

    /**
     * Downloads the image at {@code url} into the directory {@code path}.
     *
     * <p>Failures are reported on standard output and never thrown. The directory is
     * created when missing. An empty {@code path} stores the file relative to the working
     * directory.
     *
     * @param url  absolute http(s) address of the image
     * @param path target directory, with or without a trailing separator
     */
    public static void download(String url, String path) {
        if (path == null) {
            sop("找不到该网络图片....");
            return;
        }
        saveImage(url, path.length() == 0 ? null : new File(path));
    }

    /**
     * Extracts the image addresses of all {@code <img>} tags in {@code html}. A
     * {@code data-src} attribute takes precedence over {@code src}. Addresses that
     * cannot be turned into an absolute http(s) URL are skipped.
     */
    private static List<String> extractImageUrls(String html, String baseUrl) {
        List<String> urls = new ArrayList<String>();
        Matcher tags = IMG_TAG.matcher(html);
        while (tags.find()) {
            String tag = tags.group();
            Matcher attribute = DATA_SRC_ATTR.matcher(tag);
            if (!attribute.find()) {
                attribute = SRC_ATTR.matcher(tag);
                if (!attribute.find()) {
                    continue;
                }
            }
            // Unescape only the attribute value (e.g. "&amp;" -> "&"); unescaping the whole
            // document first could introduce stray '<', '>' or '"' characters.
            String raw = StringEscapeUtils.unescapeHtml3(attribute.group(1));
            String absolute = toAbsoluteHttpUrl(raw, baseUrl);
            if (absolute != null) {
                urls.add(absolute);
            }
        }
        return urls;
    }

    /**
     * Returns {@code raw} as an absolute http(s) URL, resolving relative and
     * protocol-relative ({@code //host/path}) references against {@code baseUrl}.
     * Returns {@code null} for anything else, such as {@code data:} URIs.
     */
    private static String toAbsoluteHttpUrl(String raw, String baseUrl) {
        if (raw.startsWith("http")) {
            return raw;
        }
        try {
            URL resolved = new URL(new URL(baseUrl), raw);
            String protocol = resolved.getProtocol();
            if ("http".equals(protocol) || "https".equals(protocol)) {
                return resolved.toString();
            }
            return null;
        } catch (IOException e) {
            // MalformedURLException (an IOException): not a usable image address.
            return null;
        }
    }

    /**
     * Derives a local file name from the part of {@code url} after the last slash. A query
     * string or fragment is cut off when something remains in front of it, and characters
     * that Windows forbids in file names are replaced with an underscore.
     */
    private static String fileNameOf(String url) {
        String name = url.substring(url.lastIndexOf('/') + 1);
        for (int i = 1; i < name.length(); i++) {
            char c = name.charAt(i);
            if (c == '?' || c == '#') {
                name = name.substring(0, i);
                break;
            }
        }
        return name.replaceAll("[\\\\/:*?\"<>|]", "_");
    }

    /**
     * Fetches {@code url} and writes the response body into {@code dir} ({@code null} means
     * the working directory). The output file is opened only after the server answered
     * with HTTP 200, so failed requests do not leave empty files behind.
     *
     * @return {@code true} when the image was written completely, {@code false} otherwise
     */
    private static boolean saveImage(String url, File dir) {
        if (url == null || !url.startsWith("http")) {
            sop("找不到该网络图片....");
            return false;
        }
        String fileName = fileNameOf(url);
        if (fileName.length() == 0) {
            sop("找不到该网络图片....");
            return false;
        }
        if (dir != null && !dir.exists() && dir.mkdirs()) {
            sop("creat document file \"" + dir.getPath() + "\" success...\n");
        }
        File target = dir == null ? new File(fileName) : new File(dir, fileName);

        HttpURLConnection connection = null;
        InputStream in = null;
        FileOutputStream out = null;
        try {
            connection = (HttpURLConnection) new URL(url).openConnection();
            connection.setConnectTimeout(TIMEOUT_MILLIS);
            connection.setReadTimeout(TIMEOUT_MILLIS);

            int responseCode = connection.getResponseCode();
            if (responseCode != HttpURLConnection.HTTP_OK) {
                System.out.println("状态码：" + responseCode + " 地址：" + url);
                return false;
            }

            in = connection.getInputStream();
            out = new FileOutputStream(target);
            byte[] buffer = new byte[BUFFER_SIZE];
            int read;
            while ((read = in.read(buffer)) != -1) {
                out.write(buffer, 0, read);
            }
            return true;
        } catch (FileNotFoundException e) {
            sop("找不到该网络图片....");
            return false;
        } catch (IOException e) {
            sop("产生IO异常.....");
            return false;
        } catch (RuntimeException e) {
            // Keep the best-effort contract: a single bad image must not stop the crawl.
            e.printStackTrace();
            return false;
        } finally {
            closeQuietly(out);
            closeQuietly(in);
            if (connection != null) {
                connection.disconnect();
            }
        }
    }

    /** Closes {@code resource} if it is non-null, printing instead of throwing on failure. */
    private static void closeQuietly(java.io.Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

}

其中，关键点在于两个正则表达式：一个用于匹配图片的 img 标签，另一个用于从标签中提取图片的链接地址。

String reg = "<img.*src=(.*?)[^>]*?>";
String reg2 = "src\\s*=\\s*\"?(.*?)(\"|>|\\s+)";

运行结果：

使用Jsoup 爬取网易首页所有的图片的更多相关文章

Jsoup爬取带登录验证码的网站
今天学完爬虫之后想着爬一下我们学校的教务系统,可是发现登录的时候有验证码.因此研究了Jsoup爬取带验证码的网站: 大体的思路是:(需要注意的是__VIEWSTATE一直变化,所以我们每个页面都需要重 ...
Python爬虫实战教程：爬取网易新闻
前言本文的文字及图片来源于网络,仅供学习.交流使用,不具有任何商业用途,版权归原作者所有,如有问题请及时联系我们以作处理. 作者: Amauri PS:如有需要Python学习资料的小伙伴可以加点击 ...
如何利用python爬取网易新闻
前言本文的文字及图片来源于网络,仅供学习.交流使用,不具有任何商业用途,版权归原作者所有,如有问题请及时联系我们以作处理. 作者: LSGOGroup PS:如有需要Python学习资料的小伙伴可以 ...
jsoup爬取某网站安全数据
jsoup爬取某网站安全数据 package com.vfsd.net; import java.io.IOException; import java.sql.SQLException; impor ...
Python爬虫实战教程：爬取网易新闻；爬虫精选高手技巧
前言本文的文字及图片来源于网络,仅供学习.交流使用,不具有任何商业用途,版权归原作者所有,如有问题请及时联系我们以作处理. stars声明很多小伙伴学习Python过程中会遇到各种烦恼问题解决不了.为 ...
初识python 之爬虫：爬取某网站的壁纸图片
用到的主要知识点:requests.get 获取网页HTMLetree.HTML 使用lxml解析器解析网页xpath 使用xpath获取网页标签信息.图片地址request.urlretrieve ...
python连续爬取多个网页的图片分别保存到不同的文件夹
python连续爬取多个网页的图片分别保存到不同的文件夹作者:vpoet mail:vpoet_sir@163.com #coding:utf-8 import urllib import ur ...
Python爬取贴吧中的图片
#看到贴吧大佬在发图,准备盗一下 #只是爬取一个帖子中的图片 1.先新建一个scrapy项目 scrapy startproject TuBaEx 2.新建一个爬虫 scrapy genspider ...
Python 爬取煎蛋网妹子图片
#!/usr/bin/env python # -*- coding: utf-8 -*- # @Date : 2017-08-24 10:17:28 # @Author : EnderZhou (z ...

随机推荐

SDK,monkey 浅谈
最近在工作之余碰到一些手机测试的新手,现在测试手机的基本都是android的系统. 然后在遇到压力测试的时候就开始遇到问题了. 压力测试用什么工具?怎么使用?工具怎么来? 今天遇到两个人都问我SDK是 ...
使用 BeanCopier 复制对象
Cglib是一款比较底层的操作java字节码的框架. BeanCopier是一个工具类,可以用于Bean对象内容的复制. 复制Bean对象内容的方法有很多,比如自己手动get set ,或者使用Pro ...
安装 composer SSL operation failed with code 1
gavin@webdev:~> curl -sS https://getcomposer.org/installer | php Downloading... Download failed: ...
AngularJs自定义指令详解（1） - restrict
下面所有例子都使用angular-1.3.16.下载地址:http://cdn.bootcss.com/angular.js/1.3.16/angular.min.js 既然AngularJs快要发布 ...
20145225 《Java程序设计》第2周学习总结
20145225<Java程序设计> 第2周学习总结教材学习内容总结 3.1.1Java的类型分为基本类型(Primitive type)和类类型(Class type) 基本类型: ...
解决AndroidADT自带Eclipse编辑器不能自动代码提示的问题
今天发现,我下载的AndroidADT开发套装中自带的Eclipse没有自动代码提示功能.通过参考http://blog.csdn.net/coolszy/article/details/724195 ...
C语言extern作用（全局变量）
用C语言编写程序的时候,我们经常会遇到这样一种情况:希望在头文件中定义一个全局变量,然后包含到两个不同的c文件中,希望这个全局变量能在两个文件中共用. 举例说明:项目文件夹project下有main. ...
(原创)基于FPGA的调光流水灯(Verilog，CPLD/FPGA)
1.Abstract 前几天做了一个呼吸灯,觉得确实挺有意思的:可惜的是只有一个灯管亮,板子上有四个灯,要是能让这些灯有序地亮起来,那应该更有趣味了!跟传统的一样,逻辑上做成一个流水灯的样式, ...
haskell中的monad
monad本意是单子.在haskell中,第一个接触的基本都是IO action,通过把IO动作包装起来我们能很方便的与现实世界进行数据交换.但其实monad的用途不止如此,monad还能将一系列操作 ...
Makecert.exe（证书创建工具）
Makecert.exe(证书创建工具) .NET Framework 4.5 其他版本 2(共 3)对本文的评价是有帮助 - 评价此主题证书创建工具生成仅用于测试目的的 X.509 证 ...

使用Jsoup 爬取网易首页所有的图片

使用Jsoup 爬取网易首页所有的图片的更多相关文章

随机推荐

热门专题