异步、多线程、Java爬取某网站图片
一、网页图片爬取类
package com.yhyl.utils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.scheduling.annotation.Async;
import org.springframework.stereotype.Component;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
/**
* @program: springboot-sqlserver-elasticsearch-api
* @description
* @author: xbwen
* @create: 2021-07-22 16:40
**/
@Component
public class HtmlParseUtil {
public static void main(String[] args) throws IOException {
String url = "https://www.yeitu.com/meinv/xinggan/20180919_14722.html";
Set<String> urlList = new HashSet<>();
urlList.add(url);
ExecutorService executorService = Executors.newFixedThreadPool(24);
Set<String> parseUrlList = new HtmlParseUtil().parseUrl(urlList);
Set<String> dataList = new HtmlParseUtil().parseData(parseUrlList,executorService);
executorService.shutdown();
}
//解析网页地址,递归模拟浏览器网页请求,获取自己想要得页面数
public Set<String> parseUrl(Set<String> urlList) throws IOException{
if(urlList.size()>=50){
return urlList;
}
Set<String> tempList = new HashSet<>();
for (String url : urlList) {
Document document = Jsoup.parse(new URL(url), 30000);
Elements elements = document.getElementsByTag("a");
for (Element element : elements) {
String href = element.attr("href");
if(href.endsWith(".html")){
tempList.add(href);
}
}
}
urlList.addAll(tempList);
urlList = parseUrl(urlList);
return urlList;
}
//根据网页地址,模拟浏览器请求,获取页面图片链接地址,并下载到本地磁盘
public Set<String> parseData(Set<String> urlList, ExecutorService executorService){
Set<String> contents = new HashSet<>();
for (String url : urlList) {
executorService.execute(new Thread() {
@Override
public synchronized void run(){
try {
Document document = Jsoup.parse(new URL(url), 30000);
Elements imgBoxElements = document.getElementsByClass("img_box");
for (Element element : imgBoxElements) {
Elements imgElements = element.getElementsByTag("img");
for (Element imgElement : imgElements) {
String title = imgElement.attr("alt");
String img = imgElement.attr("src");
System.out.println("下载完成:"+title+"@"+img);
// HtmlParseUtil htmlParseUtilProxy = SpringContextHolder.getBean(HtmlParseUtil.class);
downloadPicture(title,img,"E:\\picture\\"+title+".jpg");
// contents.add(title+"@"+img);
}
}
}catch (Exception e){
e.printStackTrace();
}
}
});
}
return contents;
}
//链接url下载图片
@Async
public void downloadPicture(String imgTitle, String imgUrl, String path) throws Exception{
URL url = new URL(imgUrl);
// 打开连接
URLConnection conn = url.openConnection();
// HttpURLConnection conn = (HttpURLConnection) url.openConnection();
conn.setConnectTimeout(50000);
conn.setReadTimeout(50000);
// conn.setRequestProperty(":authority", "file.jiutuvip.com");
// conn.setRequestProperty(":method", "GET");
// conn.setRequestProperty(":path", "/2021/0105/20210105101307685.jpg");
// conn.setRequestProperty(":scheme","https");
conn.setRequestProperty("accept","image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8");
conn.setRequestProperty("accept-encoding", "gzip, deflate, br");
conn.setRequestProperty("accept-language", "zh-CN,zh;q=0.9");
conn.setRequestProperty("if-modified-since", "Tue, 05 Jan 2021 02:13:07 GMT");
conn.setRequestProperty("if-none-match", "5ff3cb33-6aa5");
conn.setRequestProperty("referer", "https://www.yeitu.net/");
conn.setRequestProperty("sec-ch-ua", " Not;A Brand;"+"v=\"99\", \"Google Chrome;"+"v=\"91\", \"Chromium;"+"v=\"91\"");
conn.setRequestProperty("sec-ch-ua-mobile", "?0");
conn.setRequestProperty("sec-fetch-dest", "image");
conn.setRequestProperty("sec-fetch-mode", "no-cors");
conn.setRequestProperty("sec-fetch-site", "cross-site");
conn.setRequestProperty("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36");
conn.connect();
// List<Object> objects = new ArrayList<>();
// 输入流
InputStream is = conn.getInputStream();
// objects.add(Base64Utils.encode(imgTitle.getBytes()));
// objects.add(is);
// jdbcTemplate.update("insert into image(image_title,image_stream) values (?,?)",objects.toArray());
// 1K的数据缓冲
byte[] bs = new byte[1024];
// 读取到的数据长度
int len;
// 输出的文件流
// String filename = "D:\\图片下载/" + i + ".jpg"; //下载路径及下载图片名称
File file = new File(path);
FileOutputStream os = new FileOutputStream(file, true);
// 开始读取
while ((len = is.read(bs)) != -1) {
os.write(bs, 0, len);
}
// 完毕,关闭所有链接
os.close();
is.close();
}
}
二、POM依赖文件
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-parent</artifactId>
<version>2.4.5</version>
<relativePath/> <!-- lookup parent from repository -->
</parent>
<groupId>com.yhyl</groupId>
<artifactId>springboot-14-leetcode</artifactId>
<version>0.0.1-SNAPSHOT</version>
<name>springboot-14-leetcode</name>
<description>Demo project for Spring Boot</description>
<properties>
<java.version>1.8</java.version>
<spring-cloud.version>2020.0.2</spring-cloud.version>
</properties>
<dependencies>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-web</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.cloud</groupId>
<artifactId>spring-cloud-starter-netflix-eureka-server</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-test</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>1.18.20</version>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.10.2</version>
</dependency>
<dependency>
<groupId>com.zaxxer</groupId>
<artifactId>HikariCP</artifactId>
</dependency>
<dependency>
<groupId>com.microsoft.sqlserver</groupId>
<artifactId>mssql-jdbc</artifactId>
<version>8.4.1.jre8</version>
</dependency>
<!-- JDBC -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-jdbc</artifactId>
</dependency>
</dependencies>
<!--<dependencyManagement>
<dependencies>
<dependency>
<groupId>org.springframework.cloud</groupId>
<artifactId>spring-cloud-dependencies</artifactId>
<version>${spring-cloud.version}</version>
<type>pom</type>
<scope>import</scope>
</dependency>
</dependencies>
</dependencyManagement>-->
</project>
三、SpringBoot上下文配置
package com.yhyl.utils;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.BeansException;
import org.springframework.beans.factory.DisposableBean;
import org.springframework.context.ApplicationContext;
import org.springframework.context.ApplicationContextAware;
/**
* @author Jie
* @date 2019-01-07
*/
@Slf4j
public class SpringContextHolder implements ApplicationContextAware, DisposableBean {
private static ApplicationContext applicationContext = null;
/**
* 从静态变量applicationContext中取得Bean, 自动转型为所赋值对象的类型.
*/
@SuppressWarnings("unchecked")
public static <T> T getBean(String name) {
assertContextInjected();
return (T) applicationContext.getBean(name);
}
/**
* 从静态变量applicationContext中取得Bean, 自动转型为所赋值对象的类型.
*/
public static <T> T getBean(Class<T> requiredType) {
assertContextInjected();
return applicationContext.getBean(requiredType);
}
/**
* 检查ApplicationContext不为空.
*/
private static void assertContextInjected() {
if (applicationContext == null) {
throw new IllegalStateException("applicaitonContext属性未注入, 请在applicationContext" +
".xml中定义SpringContextHolder或在SpringBoot启动类中注册SpringContextHolder.");
}
}
/**
* 清除SpringContextHolder中的ApplicationContext为Null.
*/
private static void clearHolder() {
log.debug("清除SpringContextHolder中的ApplicationContext:"
+ applicationContext);
applicationContext = null;
}
@Override
public void destroy(){
SpringContextHolder.clearHolder();
}
@Override
public void setApplicationContext(ApplicationContext applicationContext) throws BeansException {
if (SpringContextHolder.applicationContext != null) {
log.warn("SpringContextHolder中的ApplicationContext被覆盖, 原有ApplicationContext为:" + SpringContextHolder.applicationContext);
}
SpringContextHolder.applicationContext = applicationContext;
}
}
四、SpringBoot启动类
@SpringBootApplication
@EnableAsync
public class LeectCodeApplication {
public static void main(String[] args) {
SpringApplication.run(LeectCodeApplication.class, args);
}
@Bean
public SpringContextHolder springContextHolder() {
return new SpringContextHolder();
}
}
五、执行结果

异步、多线程、Java爬取某网站图片的更多相关文章
- Java爬虫实践--爬取CSDN网站图片为例
实现的效果,自动在工程下创建Pictures文件夹,根据网站URL爬取图片,层层获取.在Pictures下以网站的层级URL命名文件夹,用来装该层URL下的图片.同时将文件名,路径,URL插入数据库, ...
- python3 urllib爬取wallhalla网站图片
点我去我的github上看源码 简单使用静态方法爬取https://wallhalla.com/网站的图片 参考: https://blog.csdn.net/cquptcmj/article/det ...
- 使用nodejs+http(s)+events+cheerio+iconv-lite爬取2717网站图片数据到本地文件夹
源代码如下: //(node:9240) Warning: Setting the NODE_TLS_REJECT_UNAUTHORIZED environment variable to '0' ...
- python3爬取动态网站图片
思路: 1.图片放在<image>XXX</image>标签中 2.利用fiddler抓包获取存放图片信息的js文件url 3.利用requests库获取html内容,然后获取 ...
- 实战爬取某网站图片-Python
直接上代码 1 #!/usr/bin/python 2 # -*- coding: UTF-8 -*- 3 from bs4 import BeautifulSoup 4 import request ...
- Scrapy 爬取某网站图片
1. 创建一个 Scrapy 项目,在命令行或者 Pycharm 的 Terminal 中输入: scrapy startproject imagepix 自动生成了下列文件: 2. 在 imagep ...
- python爬取某个网站的图片并保存到本地
python爬取某个网站的图片并保存到本地 #coding:utf- import urllib import re import sys reload(sys) sys.setdefaultenco ...
- 初识python 之 爬虫:爬取某网站的壁纸图片
用到的主要知识点:requests.get 获取网页HTMLetree.HTML 使用lxml解析器解析网页xpath 使用xpath获取网页标签信息.图片地址request.urlretrieve ...
- Python多线程爬取某网站表情包
# 爬取网络图片import requestsfrom lxml import etreefrom urllib import requestfrom queue import Queue # 导入队 ...
- 使用Python爬虫爬取网络美女图片
代码地址如下:http://www.demodashi.com/demo/13500.html 准备工作 安装python3.6 略 安装requests库(用于请求静态页面) pip install ...
随机推荐
- 【Azure Fabric Service】怎样关闭 Azure Service Fabric?
问题描述 怎样关闭Azure Service Fabric服务呢?在Azure门户上没有找到 Stop 按钮. 问题回答 Azure Service Fabric 默认是无法停止的,可以删除. 虽然可 ...
- 《A Hierarchical Framework for Relation Extraction with Reinforcement Learning》论文阅读笔记
代码 原文地址 摘要 现有的大多数方法在确定关系类型之前,需要先识别出所有的实体,这样就忽略了实体提及和关系类型之间的交互.本文提出了一种新颖的联合抽取范式,把相关实体看作是关系的参数( 首先检测一个 ...
- 在winform中如何嵌入第三方软件窗体✨
相关win32api的学习 SetParent [DllImport("user32.dll ", EntryPoint = "SetParent")] pri ...
- IDEA使用与多线程
IDEA缩写和快捷键 psvm全称public static void main sout 全称public static void main alt+enter 处理异常 s.out 自动打印s c ...
- [VueJsDev] 快速入门 - 开发前小知识
[VueJsDev] 目录列表 https://www.cnblogs.com/pengchenggang/p/17037320.html 开发前小知识 ::: details 目录 目录 开发前小知 ...
- .vscode/extensions.json 是项目用到的 插件 推荐列表,项目应该将此配置 写入用到的插件
.vscode/extensions.json 是项目用到的 插件 推荐列表,项目应该将此配置 写入用到的插件 .vscode/extensions.json { "recommendati ...
- html添加css样式的两种方法
html添加css样式有三种方法,分别为行内式(使用style属性,在特定的HTML标签内使用).内嵌式(style标签把css代码放在特定页面的head部分中).外联式(使用link标签,将外部 ...
- 精通 Grails: 用 Groovy 服务器页面(GSP)改变视图
Groovy 服务器页面(Groovy Server Pages,GSP)将 Web 置于 Grails Web 框架之内.在 精通 Grails 系列的第三期中,Scott Davis 介绍了如何使 ...
- 展会回顾 | 2023元宇宙生态博览会圆满落幕,3DCAT荣获“元宇宙交互技术奖”
2023年5月10日-5月12日,一场涵盖了元宇宙终端头显.数字文娱.数字艺术.数字运动.数字多媒体展陈设计.数字展厅展馆.科技文旅.夜游演艺.沉浸式KTV/酒吧等多个领域的元宇宙商业盛会--2023 ...
- django项目(博客二)
扩展1:admin路由分发的本质 路由分发本质 include 可以无限制的 嵌套N多层 url(r'^index/',([],None,None)) 扩展2: 由于url方法第一个参数是正则表达式, ...