一、网页图片爬取类

package com.yhyl.utils;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.scheduling.annotation.Async;
import org.springframework.stereotype.Component; import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors; /**
* @program: springboot-sqlserver-elasticsearch-api
* @description
* @author: xbwen
* @create: 2021-07-22 16:40
**/
@Component
public class HtmlParseUtil {

    // Crawler that discovers ".html" pages by following links, then downloads every
    // image found inside "img_box" elements of those pages to the local disk.

    /** The crawl stops expanding once the URL set holds at least this many entries. */
    private static final int MAX_PAGES = 50;

    /** Timeout, in milliseconds, for fetching and parsing one HTML page. */
    private static final int PAGE_TIMEOUT_MILLIS = 30000;

    /** Connect and read timeout, in milliseconds, for one image download. */
    private static final int DOWNLOAD_TIMEOUT_MILLIS = 50000;

    /** Directory prefix that downloaded images are written under. */
    private static final String PICTURE_DIR = "E:\\picture\\";

    /**
     * Command-line entry point: crawls outward from one seed page and downloads the images
     * of every discovered page on a fixed pool of 24 worker threads.
     *
     * @param args unused
     * @throws IOException if a page cannot be fetched while discovering URLs
     */
    public static void main(String[] args) throws IOException {
        String url = "https://www.yeitu.com/meinv/xinggan/20180919_14722.html";
        Set<String> urlList = new HashSet<>();
        urlList.add(url);
        ExecutorService executorService = Executors.newFixedThreadPool(24);
        try {
            HtmlParseUtil crawler = new HtmlParseUtil();
            Set<String> parseUrlList = crawler.parseUrl(urlList);
            crawler.parseData(parseUrlList, executorService);
        } finally {
            // shutdown() lets already submitted downloads finish; it only rejects new tasks.
            executorService.shutdown();
        }
    }

    /**
     * Expands {@code urlList} by following {@code <a>} links that end in ".html" until the
     * set holds at least {@value #MAX_PAGES} URLs or a pass over the not-yet-visited pages
     * yields nothing new.
     *
     * <p>Each page is fetched at most once, and the loop ends when no new link is found, so
     * a site with fewer than {@value #MAX_PAGES} reachable pages terminates instead of
     * recursing without bound. Links are resolved against the page URL so relative hrefs
     * become fetchable absolute URLs.
     *
     * @param urlList seed URLs; this set is modified in place and returned
     * @return the same {@code urlList} instance, including the discovered URLs
     * @throws IOException if any page cannot be fetched or parsed
     */
    public Set<String> parseUrl(Set<String> urlList) throws IOException {
        Set<String> visited = new HashSet<>();
        while (urlList.size() < MAX_PAGES) {
            Set<String> discovered = new HashSet<>();
            for (String url : urlList) {
                if (!visited.add(url)) {
                    continue; // already fetched in an earlier pass
                }
                Document document = Jsoup.parse(new URL(url), PAGE_TIMEOUT_MILLIS);
                Elements anchors = document.getElementsByTag("a");
                for (Element anchor : anchors) {
                    // "abs:" resolves the attribute against the document's base URI.
                    String href = anchor.attr("abs:href");
                    if (href.endsWith(".html")) {
                        discovered.add(href);
                    }
                }
            }
            // addAll returns false when nothing new was found: the crawl is exhausted.
            if (!urlList.addAll(discovered)) {
                break;
            }
        }
        return urlList;
    }

    /**
     * Submits one task per page URL to {@code executorService}; each task downloads the
     * images found inside the page's "img_box" elements.
     *
     * <p>The method returns as soon as the tasks are submitted, without waiting for them.
     *
     * @param urlList         page URLs to scan for images
     * @param executorService executor that runs the per-page tasks
     * @return an empty set; result collection is not implemented, the return value is kept
     *         for interface compatibility
     */
    public Set<String> parseData(Set<String> urlList, ExecutorService executorService) {
        Set<String> contents = new HashSet<>();
        for (String url : urlList) {
            executorService.execute(() -> downloadImagesFromPage(url));
        }
        return contents;
    }

    /** Fetches one page and downloads each image inside its "img_box" elements. */
    private void downloadImagesFromPage(String pageUrl) {
        Document document;
        try {
            document = Jsoup.parse(new URL(pageUrl), PAGE_TIMEOUT_MILLIS);
        } catch (Exception e) {
            System.err.println("Failed to load page: " + pageUrl);
            e.printStackTrace();
            return;
        }
        for (Element imgBox : document.getElementsByClass("img_box")) {
            for (Element imgElement : imgBox.getElementsByTag("img")) {
                String title = imgElement.attr("alt");
                String img = imgElement.attr("abs:src");
                if (img.isEmpty()) {
                    continue; // no usable image address on this element
                }
                // Handle failures per image so one bad download does not skip the rest.
                try {
                    downloadPicture(title, img, PICTURE_DIR + sanitizeFileName(title) + ".jpg");
                    System.out.println("下载完成:" + title + "@" + img);
                } catch (Exception e) {
                    System.err.println("Failed to download image: " + img);
                    e.printStackTrace();
                }
            }
        }
    }

    /** Replaces characters that are not allowed in Windows file names with '_'. */
    private static String sanitizeFileName(String name) {
        return name.replaceAll("[\\\\/:*?\"<>|\\p{Cntrl}]", "_");
    }

    /**
     * Downloads the resource at {@code imgUrl} and writes it to {@code path}, replacing any
     * existing file and creating the parent directory when it is missing.
     *
     * <p>NOTE(review): calls made from inside this class, as {@code parseData} does, are
     * plain method calls; Spring's {@code @Async} proxying presumably applies only when the
     * method is invoked through the Spring-managed bean - confirm before relying on it.
     *
     * @param imgTitle title of the image; currently unused
     * @param imgUrl   absolute URL of the image
     * @param path     destination file path
     * @throws Exception if the URL is malformed, the directory cannot be created, or the
     *                   transfer fails
     */
    @Async
    public void downloadPicture(String imgTitle, String imgUrl, String path) throws Exception {
        URLConnection conn = new URL(imgUrl).openConnection();
        conn.setConnectTimeout(DOWNLOAD_TIMEOUT_MILLIS);
        conn.setReadTimeout(DOWNLOAD_TIMEOUT_MILLIS);
        // Browser-like request headers. Deliberately not sent:
        //  - accept-encoding: the body is written to disk as received, so a compressed
        //    response would produce an unreadable image;
        //  - if-modified-since / if-none-match: fixed validators belonging to one specific
        //    image could make the server answer "not modified" with an empty body.
        conn.setRequestProperty("accept", "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8");
        conn.setRequestProperty("accept-language", "zh-CN,zh;q=0.9");
        conn.setRequestProperty("referer", "https://www.yeitu.net/");
        conn.setRequestProperty("sec-ch-ua",
                "\" Not;A Brand\";v=\"99\", \"Google Chrome\";v=\"91\", \"Chromium\";v=\"91\"");
        conn.setRequestProperty("sec-ch-ua-mobile", "?0");
        conn.setRequestProperty("sec-fetch-dest", "image");
        conn.setRequestProperty("sec-fetch-mode", "no-cors");
        conn.setRequestProperty("sec-fetch-site", "cross-site");
        conn.setRequestProperty("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36");
        conn.connect();

        File file = new File(path);
        File parent = file.getParentFile();
        // Re-check isDirectory() after mkdirs(): another worker may have created it first.
        if (parent != null && !parent.mkdirs() && !parent.isDirectory()) {
            throw new IOException("Cannot create directory: " + parent);
        }

        byte[] buffer = new byte[8192];
        // try-with-resources closes both streams even when the transfer fails midway.
        // The file is opened in overwrite mode so a repeated title never appends a second
        // image to an existing file.
        try (InputStream is = conn.getInputStream();
             FileOutputStream os = new FileOutputStream(file)) {
            int len;
            while ((len = is.read(buffer)) != -1) {
                os.write(buffer, 0, len);
            }
        }
    }
}

二、POM依赖文件

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>2.4.5</version>
        <relativePath/> <!-- lookup parent from repository -->
    </parent>
    <groupId>com.yhyl</groupId>
    <artifactId>springboot-14-leetcode</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <name>springboot-14-leetcode</name>
    <description>Demo project for Spring Boot</description>
    <properties>
        <java.version>1.8</java.version>
        <spring-cloud.version>2020.0.2</spring-cloud.version>
    </properties>
    <dependencies>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>
        <!-- No version here: it is supplied by the Spring Cloud BOM imported in
             dependencyManagement below. -->
        <dependency>
            <groupId>org.springframework.cloud</groupId>
            <artifactId>spring-cloud-starter-netflix-eureka-server</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-test</artifactId>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <version>1.18.20</version>
        </dependency>
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.10.2</version>
        </dependency>
        <dependency>
            <groupId>com.zaxxer</groupId>
            <artifactId>HikariCP</artifactId>
        </dependency>
        <dependency>
            <groupId>com.microsoft.sqlserver</groupId>
            <artifactId>mssql-jdbc</artifactId>
            <version>8.4.1.jre8</version>
        </dependency>
        <!-- JDBC -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-jdbc</artifactId>
        </dependency>
    </dependencies>
    <!-- Imports the Spring Cloud BOM so that Spring Cloud artifacts declared without a
         version (the Eureka server starter above) resolve; without it the build stops
         with a missing dependency version error. -->
    <dependencyManagement>
        <dependencies>
            <dependency>
                <groupId>org.springframework.cloud</groupId>
                <artifactId>spring-cloud-dependencies</artifactId>
                <version>${spring-cloud.version}</version>
                <type>pom</type>
                <scope>import</scope>
            </dependency>
        </dependencies>
    </dependencyManagement>
</project>

三、SpringBoot上下文配置

package com.yhyl.utils;

import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.BeansException;
import org.springframework.beans.factory.DisposableBean;
import org.springframework.context.ApplicationContext;
import org.springframework.context.ApplicationContextAware; /**
* @author Jie
* @date 2019-01-07
*/
@Slf4j
public class SpringContextHolder implements ApplicationContextAware, DisposableBean {

    /** Context stored by {@link #setApplicationContext}; null before that and after {@link #destroy()}. */
    private static ApplicationContext applicationContext = null;

    /**
     * Stores the supplied context in the static holder, logging a warning when a context
     * is already held and is about to be replaced.
     */
    @Override
    public void setApplicationContext(ApplicationContext applicationContext) throws BeansException {
        ApplicationContext previous = SpringContextHolder.applicationContext;
        if (previous != null) {
            log.warn("SpringContextHolder中的ApplicationContext被覆盖, 原有ApplicationContext为:" + previous);
        }
        SpringContextHolder.applicationContext = applicationContext;
    }

    /** Clears the held context when this bean is destroyed. */
    @Override
    public void destroy() {
        SpringContextHolder.clearHolder();
    }

    /**
     * Looks up a bean by name in the held context and casts it to the type expected by
     * the caller.
     *
     * @throws IllegalStateException if no context is currently held
     */
    @SuppressWarnings("unchecked")
    public static <T> T getBean(String name) {
        assertContextInjected();
        Object bean = applicationContext.getBean(name);
        return (T) bean;
    }

    /**
     * Looks up a bean by its type in the held context.
     *
     * @throws IllegalStateException if no context is currently held
     */
    public static <T> T getBean(Class<T> requiredType) {
        assertContextInjected();
        T bean = applicationContext.getBean(requiredType);
        return bean;
    }

    /** Fails with an {@link IllegalStateException} unless a context is currently held. */
    private static void assertContextInjected() {
        if (applicationContext != null) {
            return;
        }
        throw new IllegalStateException("applicaitonContext属性未注入, 请在applicationContext" +
                ".xml中定义SpringContextHolder或在SpringBoot启动类中注册SpringContextHolder.");
    }

    /** Logs the held context at debug level, then resets the holder to null. */
    private static void clearHolder() {
        log.debug("清除SpringContextHolder中的ApplicationContext:"
                + applicationContext);
        applicationContext = null;
    }
}

四、SpringBoot启动类

@SpringBootApplication
@EnableAsync
public class LeectCodeApplication {

    /**
     * Starts the Spring Boot application with this class as the primary source.
     *
     * @param args command-line arguments passed through to Spring Boot
     */
    public static void main(String[] args) {
        SpringApplication application = new SpringApplication(LeectCodeApplication.class);
        application.run(args);
    }

    /** Registers a {@link SpringContextHolder} bean in the application context. */
    @Bean
    public SpringContextHolder springContextHolder() {
        SpringContextHolder holder = new SpringContextHolder();
        return holder;
    }
}

五、执行结果

异步、多线程、Java爬取某网站图片的更多相关文章

  1. Java爬虫实践--爬取CSDN网站图片为例

    实现的效果,自动在工程下创建Pictures文件夹,根据网站URL爬取图片,层层获取.在Pictures下以网站的层级URL命名文件夹,用来装该层URL下的图片.同时将文件名,路径,URL插入数据库, ...

  2. python3 urllib爬取wallhalla网站图片

    点我去我的github上看源码 简单使用静态方法爬取https://wallhalla.com/网站的图片 参考: https://blog.csdn.net/cquptcmj/article/det ...

  3. 使用nodejs+http(s)+events+cheerio+iconv-lite爬取2717网站图片数据到本地文件夹

    源代码如下:   //(node:9240) Warning: Setting the NODE_TLS_REJECT_UNAUTHORIZED environment variable to '0' ...

  4. python3爬取动态网站图片

    思路: 1.图片放在<image>XXX</image>标签中 2.利用fiddler抓包获取存放图片信息的js文件url 3.利用requests库获取html内容,然后获取 ...

  5. 实战爬取某网站图片-Python

    直接上代码 1 #!/usr/bin/python 2 # -*- coding: UTF-8 -*- 3 from bs4 import BeautifulSoup 4 import request ...

  6. Scrapy 爬取某网站图片

    1. 创建一个 Scrapy 项目,在命令行或者 Pycharm 的 Terminal 中输入: scrapy startproject imagepix 自动生成了下列文件: 2. 在 imagep ...

  7. python爬取某个网站的图片并保存到本地

    python爬取某个网站的图片并保存到本地 #coding:utf- import urllib import re import sys reload(sys) sys.setdefaultenco ...

  8. 初识python 之 爬虫:爬取某网站的壁纸图片

    用到的主要知识点:requests.get 获取网页HTMLetree.HTML 使用lxml解析器解析网页xpath 使用xpath获取网页标签信息.图片地址request.urlretrieve ...

  9. Python多线程爬取某网站表情包

    # 爬取网络图片import requestsfrom lxml import etreefrom urllib import requestfrom queue import Queue # 导入队 ...

  10. 使用Python爬虫爬取网络美女图片

    代码地址如下:http://www.demodashi.com/demo/13500.html 准备工作 安装python3.6 略 安装requests库(用于请求静态页面) pip install ...

随机推荐

  1. 【Azure Developer】开发模式下使用AAD账号访问Azure Blob的相关参考

    问题描述 开发模式下使用AAD账号访问Azure Blob的流程参考文件 问题解答 第一步:先在AAD中注册一个APP,步骤可参考: 将应用程序注册到 Microsoft 标识平台 :https:// ...

  2. 【Azure 媒体服务】AMS的Manifest文件中SmoothStreamingMedia片段中<c t="6161940" d="749970" r="2" n="0" />, c, t, d, r, n 的解析

    问题描述 在Azure媒体服务(AMS: Azure Media Service)中,不管是点播,直播都需要下载manifest文件.而文件中有一段[<c t="6161940" ...

  3. Java ----多线程 案例

    1 package bytezero.threadtest2; 2 3 /** 4 * 银行有一个账户 5 * 有两个储户分别向同一个账户存 3000元,每次存1000,存三次,每次存完打印账户余额 ...

  4. vid = two 切开 分开 - 两个眼睛 还有看的含义 - 词根

    vid = two 切开 分开 - 两个眼睛 还有看的含义 - 词根 vi = wo acs 构词

  5. async await $api vue

    async getDataNew () { const res = await this.$api('apiPath') if (res && res.status === 20) { ...

  6. C#使用Stateless和箭头控件实现状态机的控制及显示

    之前开发一个小工具,内部实现一个状态机,并显示状态机当前状态及状态间的转移过程.我使用了Stateless开源类库及一个开源自定义箭头控件.自定义箭头控件是HZHControls其中一个控件,我单独把 ...

  7. 2.4G无线音频一对多传输解决方案难点解析

    前记     2.4G无线音频传输是一个非主流的应用,做这个的人 相对要比较少.但是,这个领域所涉及到的知识却不少,也就导致了这个领域是好入门,但是东西想做好特别难.这里涉及到声学,无线协议,电子,设 ...

  8. 逆向通达信Level-2 续四 (调试level2数据接口)

    逆向通达信Level-2 续十一 (无帐号登陆itrend研究版) 逆向通达信Level-2 续十 (trace脱壳) 逆向通达信Level-2 续九 (无帐号打开itrend研究版) 逆向通达信Le ...

  9. 3DCAT首届行业生态交流会|爱智慧科技有限公司CEO梁新刚:工业元宇宙的”形“与”神“

    2021年12月17日下午,由深圳市瑞云科技有限公司主办,深圳市虚拟现实产业联合会协办的 云XR如何赋能元宇宙--3DCAT实时云渲染首届行业生态合作交流会 圆满落幕.此次活动围绕"云XR如 ...

  10. webapi通过docker部署到Linux的两种方式

    docker 安装官网 删除docker sudo yum remove docker \ docker-client \ docker-client-latest \ docker-common \ ...