JAVA爬取百度贴吧图片
package com.wang.xiaowei.utils; import com.sun.image.codec.jpeg.JPEGCodec;
import com.sun.image.codec.jpeg.JPEGImageEncoder;
import org.apache.http.HttpEntity;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import javax.imageio.ImageIO;
import java.awt.*;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URL;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;
import java.util.UUID; /**
* @author WXW on 2017/11/22.
*/
public class TieBaImageDownload { private static final RequestConfig REQUEST_CONFIG = RequestConfig.custom()
.setSocketTimeout(15000)
.setConnectTimeout(15000)
.setConnectionRequestTimeout(15000)
.build(); /**贴吧主路径*/
private static final String TB_BASE_URL = "https://tieba.baidu.com";
/**用于存放url*/
private static Map<String,String> URL_MAP;
/**图片保存的路径*/
private static final String IMAGE_SAVE_DIRECT = "E:/baiduimage/";
/**HttpClient对象*/
private static CloseableHttpClient httpClient = null;
/**每页有多少条帖子*/
private static final int EVERY_PAGE_COUNT_SIZE = 50; /**水印图片路径*/
private static final String WATER_IMAGE_PATH = "E://baiduimage//3.png";
/**透明度*/
private static final float WATER_IMAGE_ALPHA = 0.5F;
/**X间距*/
private static final int WATER_IMAGE_MARGIN_Y = 100;
/**Y间距*/
private static final int WATER_IMAGE_MARGIN_X = 100;
/**水印图片选中角度*/
private static final int WATER_IMAGE_RADIANS = 30; /**
* 获取指定贴吧的全部内容
* @param key 贴吧关键字
* @param maxPage 最大页码
* @param onlySeeLz 是否只看楼主
* @param addWaterImage 是否添加水印
* @throws Exception e
*/
public static void getHttpUrl(String key,int maxPage,boolean onlySeeLz,boolean addWaterImage) throws Exception{
//初始化httpclient对象
httpClient = HttpClients.createDefault(); //开始按照页面爬取内容
while(maxPage > 0){
System.out.println("=============正在处理第"+maxPage+"页===================");
//每页有50条数据
int pageIndex = (maxPage - 1) * EVERY_PAGE_COUNT_SIZE;
//路径
String spaderUrl = TB_BASE_URL + "/f?kw=" + key + "&ie=utf-8&pn="+ pageIndex;
System.out.println("spaderUrl==== "+spaderUrl);
String responseContent = getHtmlContent(spaderUrl);
processHtml(responseContent,onlySeeLz);
maxPage--;
}
httpClient.close();
downLoadImage(addWaterImage);
} /***
* 根据url获取页面源码内容
* @param url url
* @return 页面源码内容
* @throws Exception e
*/
private static String getHtmlContent(String url) throws Exception {
//get方式获取页面内容
HttpGet get = new HttpGet(url);
get.setConfig(REQUEST_CONFIG);
HttpEntity entity = httpClient.execute(get).getEntity();
return EntityUtils.toString(entity, "UTF-8");
} /***
* 处理html内容
* @param responseContent html内容
* @param onlySeeLz 是否只看楼主
* @throws Exception e
*/
private static void processHtml(String responseContent,boolean onlySeeLz) throws Exception {
Document doc = Jsoup.parse(responseContent);
//获取所有 class=j_th_tit 的 a标签;帖子的具体连接
Elements urls = doc.select("a.j_th_tit");
for (Element e : urls){
//帖子标题
String tText = e.text();
//帖子连接
String tUrl = e.attr("href");
tUrl = TB_BASE_URL + "" + tUrl;
//只看楼主
if(onlySeeLz){
tUrl = tUrl + "?see_lz=1";
}
//将获取到的帖子url放入Map
URL_MAP.put(tText,tUrl);
}
} /***
* 获取每个帖子内容中的图片信息
* @param addWaterImage 是否加水印
* @throws Exception e
*/
private static void downLoadImage(boolean addWaterImage) throws Exception {
for (String str : URL_MAP.values()){
//帖子的url
System.out.println("帖子的url=== "+str);
Document doc = Jsoup.connect(str).get();
//帖子中 class=img.BDE_Image的元素
Elements images = doc.select("img.BDE_Image");
for (Element e : images){
//获取图片url
String imageUrl = e.attr("src");
System.out.println("imageUrl============ "+imageUrl);
saveImage(imageUrl,addWaterImage);
}
}
} /**
* 将图片保存到本地
* @param imageUrl imageUrl
* @param addWaterImage addWaterImage
* @throws Exception e
*/
private static void saveImage(String imageUrl,boolean addWaterImage) throws Exception{
//每天创建一个目录
SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMdd");
String filePath = sdf.format(new Date());
File imageFile = new File(IMAGE_SAVE_DIRECT+"//"+filePath);
if(!imageFile.exists()){
if (imageFile.mkdirs()){
System.out.println("---------创建目录成功-------------");
}
}
//随机生成图片名称
String fileName = UUID.randomUUID().toString().replaceAll("-","");
URL url = new URL(imageUrl);
InputStream is = url.openStream();
OutputStream os = new FileOutputStream(IMAGE_SAVE_DIRECT+"//"+filePath + "//" + fileName +".jpg");
if(addWaterImage){
addWaterImage(url,os);
}
if(!addWaterImage){
saveImageWithoutWaterImage(is,os);
}
} private static void saveImageWithoutWaterImage(InputStream is,OutputStream os) throws Exception{
byte[] buff = new byte[1024];
int readed;
while ((readed = is.read(buff)) != -1) {
os.write(buff, 0, readed);
}
is.close();
os.close();
} /***
* 打印水印
* @param sourceImagePath 原图片路径
* @param os os
* @throws Exception e
*/
private static void addWaterImage(URL sourceImagePath,OutputStream os) throws Exception{
//根据图片路径生成图片对象。获取图片的宽度高度
Image image = ImageIO.read(sourceImagePath);
int width = image.getWidth(null);
int height = image.getHeight(null); //根据图片的宽高,生成画布,将原图画到画布
BufferedImage bufferedImage = new BufferedImage(width, height, BufferedImage.TYPE_INT_RGB);
Graphics2D graphics2d = bufferedImage.createGraphics();
graphics2d.drawImage(image, 0, 0, width, height, null); //水印图片
Image waterImage = ImageIO.read(new File(WATER_IMAGE_PATH));
int waterImageWidth = waterImage.getWidth(null);
int waterImageHeight = waterImage.getHeight(null); //水印透明设置
graphics2d.setComposite(AlphaComposite.getInstance(AlphaComposite.SRC_ATOP, WATER_IMAGE_ALPHA));
//旋转 rotate(选中度数,圆心x坐标,圆心y坐标)
graphics2d.rotate(Math.toRadians(WATER_IMAGE_RADIANS), bufferedImage.getWidth()/2, bufferedImage.getHeight()/2); // 循环打印水印图片
int waterImageX = -width / 2;
while(waterImageX < width * 1.5){
int waterImageY = -height / 2;
while(waterImageY < height * 1.5){
graphics2d.drawImage(waterImage, waterImageX, waterImageY, null);
waterImageY += waterImageHeight + WATER_IMAGE_MARGIN_Y;
}
waterImageX += waterImageWidth + WATER_IMAGE_MARGIN_X;
}
graphics2d.dispose(); //创建图像编码工具类
JPEGImageEncoder en = JPEGCodec.createJPEGEncoder(os);
//使用图像编码工具类,输出缓存图像到目标文件
en.encode(bufferedImage);
os.close();
} public static void main(String[] args){
try {
URL_MAP = new HashMap<>();
boolean onlySeeLz = true;
String key = "柳岩";
int maxPage = 1;
boolean addWaterImage = true;
getHttpUrl(key,maxPage,onlySeeLz,addWaterImage);
} catch (Exception e) {
e.printStackTrace();
}
} }
JAVA爬取百度贴吧图片的更多相关文章
- Java爬取 百度图片Google图片Bing图片
先看看抓取的结果. 8个Java类: Startup.java - main函数 ImageCrawler.java - Crawler基类 BaiduImageCrawler.java - 百度图片 ...
- Python简易爬虫爬取百度贴吧图片
通过python 来实现这样一个简单的爬虫功能,把我们想要的图片爬取到本地.(Python版本为3.6.0) 一.获取整个页面数据 def getHtml(url): page=urllib.requ ...
- 【Python】Python简易爬虫爬取百度贴吧图片
通过python 来实现这样一个简单的爬虫功能,把我们想要的图片爬取到本地.(Python版本为3.6.0) 一.获取整个页面数据 def getHtml(url): page=urllib.requ ...
- java爬取百度首页源代码
爬虫感觉挺有意思的,写一个最简单的抓取百度首页html代码的程序.虽然简单了一点,后期会加深的. package test; import java.io.BufferedReader; import ...
- Python: 爬取百度贴吧图片
练习之代码片段,以做备忘: # encoding=utf8 from __future__ import unicode_literals import urllib, urllib2 import ...
- Python爬取百度贴吧图片
一.获取URL Urllib 模块提供了读取web页面数据的接口,我们可以像读取本地文件一样读取www和ftp上的数据.首先,我们定义了一个getHtml()函数: urllib.urlopen()方 ...
- Python每日一练(3):爬取百度贴吧图片
import requests,re #先把要访问URL和头部准备好 url = 'http://tieba.baidu.com/p/2166231880' head = { 'Accept': '* ...
- 百度图片爬虫-python版-如何爬取百度图片?
上一篇我写了如何爬取百度网盘的爬虫,在这里还是重温一下,把链接附上: http://www.cnblogs.com/huangxie/p/5473273.html 这一篇我想写写如何爬取百度图片的爬虫 ...
- python爬取某个网页的图片-如百度贴吧
python爬取某个网页的图片-如百度贴吧 作者:vpoet mail:vpoet_sir@163.com 注:随意copy,不用告诉我 #coding:utf-8 import urllib imp ...
随机推荐
- android中 检查网络连接状态的变化,无网络时跳转到设置界面
1:在AndroidManifest.xml中加一个声明 <receiver android:name="NetCheckReceiver"> <inten ...
- Unity2017烘焙参数设置
- socket+django
1.socket 网络上任意两个程序之间要进行通信,需要依靠socket(端口).socket封装了TCP/IP协议,让网络通信基于TCP/IP协议的形式实现. socket可以翻译为插座,那么一个服 ...
- 富文本编辑器layedit,调用setContent方法会报错
需要把layedit.js里的setContent 函数的 layedit.sync(index)); 改成 this.sync(index));
- 14西安区域赛C - The Problem Needs 3D Arrays
最大密度子图裸题,详情请见胡博涛论文: https://wenku.baidu.com/view/986baf00b52acfc789ebc9a9.html 不加当前弧优化t到死= = //#prag ...
- 模拟模拟vij1120
花生采摘 描述 鲁宾逊先生有一只宠物猴,名叫多多.这天,他们两个正沿着乡间小路散步,突然发现路的告示牌上贴着一张小小的纸条:“欢迎免费品尝我种的花生!——熊字”. 鲁宾逊先生和多多都很开心,因为花生正 ...
- UVA-1220 Party at Hali-Bula (树的最大独立集)
题目大意:数的最大独立集问题.特殊在要求回答答案是否唯一. 题目分析:定义状态dp(i,1),dp(i,0)分别表示以i为根节点的子树选不选i最多可选的人数,f(i,1),f(i,0)分别表示以i为根 ...
- Oracle性能诊断艺术-读书笔记(范围分区)
1. PARTITION RANGE SINGLE 注意:操作2 中的 TABLE ACCESS FULL 并不是全表扫描,只是对分区1 做 全分区扫描 case2 2. 分区范围迭代(PARTITI ...
- Pl/sql Dev登录数据库查询报ORA-03114
pl/sql dev能正常登录数据库,但是登录后执行查询报 ORA-03114异常. 1.建议安装PL/SQL Dev的32位版,64位版不太好用,且需要64位对应的OCI.DLL. 64位简直不要太 ...
- OC MRC之set方法内存管理(代码分析)
// // main.m // 03-set方法的内存管理 // // Created by apple on 13-8-9. // Copyright (c) 2013年 itcast. All r ...