一、网页图片爬取类

package com.yhyl.utils;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.scheduling.annotation.Async;
import org.springframework.stereotype.Component; import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors; /**
* @program: springboot-sqlserver-elasticsearch-api
* @description
* @author: xbwen
* @create: 2021-07-22 16:40
**/
// Crawls a site for ".html" pages and downloads every <img> found inside elements
// with the CSS class "img_box" to the local disk.
@Component
public class HtmlParseUtil {

    /** Page discovery stops once at least this many URLs are known. */
    private static final int MAX_PAGES = 50;

    /** Timeout, in milliseconds, for fetching and parsing one HTML page. */
    private static final int PAGE_TIMEOUT_MS = 30000;

    /** Connect/read timeout, in milliseconds, for downloading one image. */
    private static final int IMAGE_TIMEOUT_MS = 50000;

    /**
     * Stand-alone entry point: discovers pages starting from one seed URL and
     * downloads their images on a fixed pool of 24 worker threads.
     *
     * @param args unused
     * @throws IOException if a page cannot be fetched during discovery
     */
    public static void main(String[] args) throws IOException {
        String url = "https://www.yeitu.com/meinv/xinggan/20180919_14722.html";
        Set<String> urlList = new HashSet<>();
        urlList.add(url);
        HtmlParseUtil util = new HtmlParseUtil();
        ExecutorService executorService = Executors.newFixedThreadPool(24);
        try {
            Set<String> parseUrlList = util.parseUrl(urlList);
            util.parseData(parseUrlList, executorService);
        } finally {
            // shutdown() still runs the tasks that were already submitted and then
            // releases the pool, even when discovery failed with an exception.
            executorService.shutdown();
        }
    }

    /**
     * Expands {@code urlList} breadth-first with the ".html" links found on its pages.
     *
     * <p>Each round fetches only the pages discovered in the previous round. Discovery
     * ends when the set holds at least {@value #MAX_PAGES} URLs or when a round finds
     * nothing new, so the method always terminates (the earlier recursive version
     * recursed without bound once no new links turned up). Links are resolved against
     * the page they appear on, so relative hrefs become fetchable absolute URLs.
     *
     * @param urlList seed URLs; the set is modified in place and also returned
     * @return the same set instance, now including the discovered page URLs
     * @throws IOException if a URL is malformed or a page cannot be fetched
     */
    public Set<String> parseUrl(Set<String> urlList) throws IOException {
        Set<String> frontier = new HashSet<>(urlList);
        while (!frontier.isEmpty() && urlList.size() < MAX_PAGES) {
            Set<String> discovered = new HashSet<>();
            for (String url : frontier) {
                Document document = Jsoup.parse(new URL(url), PAGE_TIMEOUT_MS);
                for (Element anchor : document.getElementsByTag("a")) {
                    // absUrl yields "" when the href cannot be made absolute,
                    // which then fails the ".html" check below.
                    String href = anchor.absUrl("href");
                    if (href.endsWith(".html") && !urlList.contains(href)) {
                        discovered.add(href);
                    }
                }
            }
            urlList.addAll(discovered);
            frontier = discovered;
        }
        return urlList;
    }

    /**
     * Submits one task per page to {@code executorService}; each task downloads the
     * images inside the page's "img_box" elements.
     *
     * <p>The method returns right after submitting. The returned set is thread-safe
     * and is filled by the worker tasks as downloads succeed, so it is only complete
     * once the executor has finished its work.
     *
     * @param urlList         page URLs to scan
     * @param executorService executor that runs the per-page tasks
     * @return a live set of {@code title@imageUrl} entries, one per downloaded image
     */
    public Set<String> parseData(Set<String> urlList, ExecutorService executorService) {
        Set<String> contents = java.util.concurrent.ConcurrentHashMap.newKeySet();
        for (String url : urlList) {
            executorService.execute(() -> downloadImagesFromPage(url, contents));
        }
        return contents;
    }

    /** Fetches one page and downloads each of its "img_box" images, recording successes in {@code contents}. */
    private void downloadImagesFromPage(String pageUrl, Set<String> contents) {
        Document document;
        try {
            document = Jsoup.parse(new URL(pageUrl), PAGE_TIMEOUT_MS);
        } catch (Exception e) {
            // Task boundary: a broken page must not kill the worker thread.
            System.err.println("Failed to fetch page: " + pageUrl);
            e.printStackTrace();
            return;
        }
        for (Element imgBox : document.getElementsByClass("img_box")) {
            for (Element imgElement : imgBox.getElementsByTag("img")) {
                String title = imgElement.attr("alt");
                // Resolve relative and protocol-relative sources against the page URL.
                String img = imgElement.absUrl("src");
                if (img.isEmpty()) {
                    continue;
                }
                try {
                    downloadPicture(title, img, "E:\\picture\\" + toSafeFileName(title) + ".jpg");
                    // Reported only after the file has actually been written.
                    System.out.println("下载完成:" + title + "@" + img);
                    contents.add(title + "@" + img);
                } catch (Exception e) {
                    // One failed image must not abort the remaining images of the page.
                    System.err.println("Failed to download image: " + img);
                    e.printStackTrace();
                }
            }
        }
    }

    /** Replaces the characters Windows forbids in file names with an underscore. */
    private static String toSafeFileName(String title) {
        return title.replaceAll("[\\\\/:*?\"<>|]", "_");
    }

    /**
     * Downloads {@code imgUrl} to {@code path}, replacing any existing file.
     *
     * <p>Note: {@code @Async} only takes effect when this method is invoked through
     * the Spring-managed proxy of this bean. Direct calls from inside this class
     * (as the page tasks do) run synchronously on the calling thread.
     *
     * @param imgTitle image title; not used for the download itself
     * @param imgUrl   absolute URL of the image
     * @param path     target file; missing parent directories are created
     * @throws Exception if the URL is malformed, the server does not answer with
     *                   HTTP 200, or reading/writing fails
     */
    @Async
    public void downloadPicture(String imgTitle, String imgUrl, String path) throws Exception {
        URLConnection conn = new URL(imgUrl).openConnection();
        conn.setConnectTimeout(IMAGE_TIMEOUT_MS);
        conn.setReadTimeout(IMAGE_TIMEOUT_MS);
        // Browser-like request headers. Deliberately NOT sent:
        //  - if-modified-since / if-none-match: fixed validators can make the server
        //    answer 304 without a body;
        //  - accept-encoding: URLConnection does not decode gzip/deflate/br, so a
        //    compressed response would be written to disk as-is.
        conn.setRequestProperty("accept", "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8");
        conn.setRequestProperty("accept-language", "zh-CN,zh;q=0.9");
        conn.setRequestProperty("referer", "https://www.yeitu.net/");
        conn.setRequestProperty("sec-ch-ua",
                "\" Not;A Brand\";v=\"99\", \"Google Chrome\";v=\"91\", \"Chromium\";v=\"91\"");
        conn.setRequestProperty("sec-ch-ua-mobile", "?0");
        conn.setRequestProperty("sec-fetch-dest", "image");
        conn.setRequestProperty("sec-fetch-mode", "no-cors");
        conn.setRequestProperty("sec-fetch-site", "cross-site");
        conn.setRequestProperty("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36");
        conn.connect();

        if (conn instanceof java.net.HttpURLConnection) {
            int status = ((java.net.HttpURLConnection) conn).getResponseCode();
            if (status != java.net.HttpURLConnection.HTTP_OK) {
                throw new IOException("Unexpected HTTP status " + status + " for " + imgUrl);
            }
        }

        File file = new File(path);
        File parent = file.getParentFile();
        if (parent != null && !parent.isDirectory() && !parent.mkdirs() && !parent.isDirectory()) {
            throw new IOException("Cannot create directory " + parent);
        }

        // try-with-resources closes both streams even when the copy fails. The file is
        // opened in overwrite mode so a re-download never appends to an older image.
        try (InputStream is = conn.getInputStream();
             FileOutputStream os = new FileOutputStream(file)) {
            byte[] bs = new byte[8192];
            int len;
            while ((len = is.read(bs)) != -1) {
                os.write(bs, 0, len);
            }
        }
    }
}

二、POM依赖文件

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>2.4.5</version>
        <relativePath/> <!-- lookup parent from repository -->
    </parent>
    <groupId>com.yhyl</groupId>
    <artifactId>springboot-14-leetcode</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <name>springboot-14-leetcode</name>
    <description>Demo project for Spring Boot</description>
    <properties>
        <java.version>1.8</java.version>
        <spring-cloud.version>2020.0.2</spring-cloud.version>
    </properties>
    <dependencies>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>
        <!-- Version comes from the spring-cloud-dependencies BOM imported below. -->
        <dependency>
            <groupId>org.springframework.cloud</groupId>
            <artifactId>spring-cloud-starter-netflix-eureka-server</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-test</artifactId>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <version>1.18.20</version>
        </dependency>
        <!-- HTML parsing for the crawler. -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.10.2</version>
        </dependency>
        <dependency>
            <groupId>com.zaxxer</groupId>
            <artifactId>HikariCP</artifactId>
        </dependency>
        <dependency>
            <groupId>com.microsoft.sqlserver</groupId>
            <artifactId>mssql-jdbc</artifactId>
            <version>8.4.1.jre8</version>
        </dependency>
        <!-- JDBC -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-jdbc</artifactId>
        </dependency>
    </dependencies>
    <!--
        The Spring Boot parent does not manage Spring Cloud artifacts. Without this BOM
        import the version-less eureka-server dependency above cannot be resolved and
        Maven rejects the POM.
    -->
    <dependencyManagement>
        <dependencies>
            <dependency>
                <groupId>org.springframework.cloud</groupId>
                <artifactId>spring-cloud-dependencies</artifactId>
                <version>${spring-cloud.version}</version>
                <type>pom</type>
                <scope>import</scope>
            </dependency>
        </dependencies>
    </dependencyManagement>
</project>

三、SpringBoot上下文配置

package com.yhyl.utils;

import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.BeansException;
import org.springframework.beans.factory.DisposableBean;
import org.springframework.context.ApplicationContext;
import org.springframework.context.ApplicationContextAware; /**
* @author Jie
* @date 2019-01-07
*/
// Keeps the Spring ApplicationContext in a static field so that code which is not
// managed by Spring can still look up beans.
@Slf4j
public class SpringContextHolder implements ApplicationContextAware, DisposableBean {

    /** The context handed over by Spring; {@code null} until injected and again after destroy. */
    private static ApplicationContext applicationContext;

    /**
     * Stores the context passed in by Spring, logging a warning when an earlier
     * context is being replaced.
     *
     * @param applicationContext the context to hold
     * @throws BeansException declared by the {@link ApplicationContextAware} contract
     */
    @Override
    public void setApplicationContext(ApplicationContext applicationContext) throws BeansException {
        ApplicationContext previous = SpringContextHolder.applicationContext;
        if (previous != null) {
            log.warn("SpringContextHolder中的ApplicationContext被覆盖, 原有ApplicationContext为:" + previous);
        }
        SpringContextHolder.applicationContext = applicationContext;
    }

    /** Drops the held context when this bean is destroyed. */
    @Override
    public void destroy() {
        clearHolder();
    }

    /**
     * Looks up a bean by name and casts it to the type expected by the caller.
     *
     * @param name bean name
     * @param <T>  type the caller assigns the result to
     * @return the bean registered under {@code name}
     * @throws IllegalStateException if no context has been injected yet
     */
    @SuppressWarnings("unchecked")
    public static <T> T getBean(String name) {
        assertContextInjected();
        Object bean = applicationContext.getBean(name);
        return (T) bean;
    }

    /**
     * Looks up a bean by its type.
     *
     * @param requiredType type of the wanted bean
     * @param <T>          bean type
     * @return the bean of type {@code requiredType}
     * @throws IllegalStateException if no context has been injected yet
     */
    public static <T> T getBean(Class<T> requiredType) {
        assertContextInjected();
        T bean = applicationContext.getBean(requiredType);
        return bean;
    }

    /** Fails with an {@link IllegalStateException} when the context has not been injected. */
    private static void assertContextInjected() {
        if (applicationContext != null) {
            return;
        }
        throw new IllegalStateException(
                "applicaitonContext属性未注入, 请在applicationContext.xml中定义SpringContextHolder或在SpringBoot启动类中注册SpringContextHolder.");
    }

    /** Logs the currently held context at debug level and then resets the holder to {@code null}. */
    private static void clearHolder() {
        log.debug("清除SpringContextHolder中的ApplicationContext:" + applicationContext);
        applicationContext = null;
    }
}

四、SpringBoot启动类

/**
 * Spring Boot launcher. {@code @EnableAsync} turns on processing of {@code @Async}
 * methods, and the class registers a {@code SpringContextHolder} bean.
 */
@SpringBootApplication
@EnableAsync
public class LeectCodeApplication {

    /**
     * Registers the holder as a bean so that Spring hands it the application context.
     *
     * @return a new {@code SpringContextHolder}
     */
    @Bean
    public SpringContextHolder springContextHolder() {
        SpringContextHolder holder = new SpringContextHolder();
        return holder;
    }

    /**
     * Starts the application.
     *
     * @param args command-line arguments forwarded to Spring Boot
     */
    public static void main(String[] args) {
        SpringApplication.run(LeectCodeApplication.class, args);
    }
}

五、执行结果

异步、多线程、Java爬取某网站图片的更多相关文章

  1. Java爬虫实践--爬取CSDN网站图片为例

    实现的效果,自动在工程下创建Pictures文件夹,根据网站URL爬取图片,层层获取.在Pictures下以网站的层级URL命名文件夹,用来装该层URL下的图片.同时将文件名,路径,URL插入数据库, ...

  2. python3 urllib爬取wallhalla网站图片

    点我去我的github上看源码 简单使用静态方法爬取https://wallhalla.com/网站的图片 参考: https://blog.csdn.net/cquptcmj/article/det ...

  3. 使用nodejs+http(s)+events+cheerio+iconv-lite爬取2717网站图片数据到本地文件夹

    源代码如下:   //(node:9240) Warning: Setting the NODE_TLS_REJECT_UNAUTHORIZED environment variable to '0' ...

  4. python3爬取动态网站图片

    思路: 1.图片放在<image>XXX</image>标签中 2.利用fiddler抓包获取存放图片信息的js文件url 3.利用requests库获取html内容,然后获取 ...

  5. 实战爬取某网站图片-Python

    直接上代码 1 #!/usr/bin/python 2 # -*- coding: UTF-8 -*- 3 from bs4 import BeautifulSoup 4 import request ...

  6. Scrapy 爬取某网站图片

    1. 创建一个 Scrapy 项目,在命令行或者 Pycharm 的 Terminal 中输入: scrapy startproject imagepix 自动生成了下列文件: 2. 在 imagep ...

  7. python爬取某个网站的图片并保存到本地

    python爬取某个网站的图片并保存到本地 #coding:utf- import urllib import re import sys reload(sys) sys.setdefaultenco ...

  8. 初识python 之 爬虫:爬取某网站的壁纸图片

    用到的主要知识点:requests.get 获取网页HTMLetree.HTML 使用lxml解析器解析网页xpath 使用xpath获取网页标签信息.图片地址request.urlretrieve ...

  9. Python多线程爬取某网站表情包

    # 爬取网络图片import requestsfrom lxml import etreefrom urllib import requestfrom queue import Queue # 导入队 ...

  10. 使用Python爬虫爬取网络美女图片

    代码地址如下:http://www.demodashi.com/demo/13500.html 准备工作 安装python3.6 略 安装requests库(用于请求静态页面) pip install ...

随机推荐

  1. 【Azure Fabric Service】怎样关闭 Azure Service Fabric?

    问题描述 怎样关闭Azure Service Fabric服务呢?在Azure门户上没有找到 Stop 按钮. 问题回答 Azure Service Fabric 默认是无法停止的,可以删除. 虽然可 ...

  2. 《A Hierarchical Framework for Relation Extraction with Reinforcement Learning》论文阅读笔记

    代码 原文地址 摘要 现有的大多数方法在确定关系类型之前,需要先识别出所有的实体,这样就忽略了实体提及和关系类型之间的交互.本文提出了一种新颖的联合抽取范式,把相关实体看作是关系的参数( 首先检测一个 ...

  3. 在winform中如何嵌入第三方软件窗体✨

    相关win32api的学习 SetParent [DllImport("user32.dll ", EntryPoint = "SetParent")] pri ...

  4. IDEA使用与多线程

    IDEA缩写和快捷键 psvm全称public static void main sout 全称System.out.println alt+enter 处理异常 s.out 自动打印s c ...

  5. [VueJsDev] 快速入门 - 开发前小知识

    [VueJsDev] 目录列表 https://www.cnblogs.com/pengchenggang/p/17037320.html 开发前小知识 ::: details 目录 目录 开发前小知 ...

  6. .vscode/extensions.json 是项目用到的 插件 推荐列表,项目应该将此配置 写入用到的插件

    .vscode/extensions.json 是项目用到的 插件 推荐列表,项目应该将此配置 写入用到的插件 .vscode/extensions.json { "recommendati ...

  7. html添加css样式的两种方法

      html添加css样式有三种方法,分别为行内式(使用style属性,在特定的HTML标签内使用).内嵌式(style标签把css代码放在特定页面的head部分中).外联式(使用link标签,将外部 ...

  8. 精通 Grails: 用 Groovy 服务器页面(GSP)改变视图

    Groovy 服务器页面(Groovy Server Pages,GSP)将 Web 置于 Grails Web 框架之内.在 精通 Grails 系列的第三期中,Scott Davis 介绍了如何使 ...

  9. 展会回顾 | 2023元宇宙生态博览会圆满落幕,3DCAT荣获“元宇宙交互技术奖”

    2023年5月10日-5月12日,一场涵盖了元宇宙终端头显.数字文娱.数字艺术.数字运动.数字多媒体展陈设计.数字展厅展馆.科技文旅.夜游演艺.沉浸式KTV/酒吧等多个领域的元宇宙商业盛会--2023 ...

  10. django项目(博客二)

    扩展1:admin路由分发的本质 路由分发本质 include 可以无限制的 嵌套N多层 url(r'^index/',([],None,None)) 扩展2: 由于url方法第一个参数是正则表达式, ...