package com.wang.xiaowei.utils;

import com.sun.image.codec.jpeg.JPEGCodec;
import com.sun.image.codec.jpeg.JPEGImageEncoder;
import org.apache.http.HttpEntity;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import javax.imageio.ImageIO;
import java.awt.*;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URL;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;
import java.util.UUID; /**
* @author WXW on 2017/11/22.
*/
public class TieBaImageDownload {

    // Crawler for Baidu Tieba: walks the thread-listing pages of one forum,
    // collects the thread URLs, then downloads every "img.BDE_Image" picture
    // found in those threads, optionally tiling a rotated, semi-transparent
    // watermark over each picture.
    //
    // All state is held in static fields and URL_MAP is a plain HashMap, so
    // the class is not safe for concurrent crawls.

    /** Timeouts (in milliseconds) applied to every listing-page request. */
    private static final RequestConfig REQUEST_CONFIG = RequestConfig.custom()
            .setSocketTimeout(15000)
            .setConnectTimeout(15000)
            .setConnectionRequestTimeout(15000)
            .build();

    /** Base URL of Baidu Tieba. */
    private static final String TB_BASE_URL = "https://tieba.baidu.com";

    /**
     * Thread URL -> thread title for every thread found so far.
     * Keyed by URL (not title) so that threads sharing a title are not lost.
     */
    private static final Map<String, String> URL_MAP = new HashMap<>();

    /** Root directory under which downloaded images are stored. */
    private static final String IMAGE_SAVE_DIRECT = "E:/baiduimage/";

    /** Number of threads per listing page; used to compute the "pn" offset. */
    private static final int EVERY_PAGE_COUNT_SIZE = 50;

    /** Path of the watermark image file. */
    private static final String WATER_IMAGE_PATH = "E://baiduimage//3.png";

    /** Opacity of the watermark (0 = invisible, 1 = opaque). */
    private static final float WATER_IMAGE_ALPHA = 0.5F;

    /** Vertical gap, in pixels, between two tiled watermarks. */
    private static final int WATER_IMAGE_MARGIN_Y = 100;

    /** Horizontal gap, in pixels, between two tiled watermarks. */
    private static final int WATER_IMAGE_MARGIN_X = 100;

    /** Rotation of the watermark tiling in DEGREES (converted with Math.toRadians before use). */
    private static final int WATER_IMAGE_RADIANS = 30;

    /**
     * Crawls the listing pages {@code maxPage .. 1} of the given forum, collects the
     * thread links and then downloads all images of the collected threads.
     *
     * @param key           forum keyword (the "kw" query parameter); must not be null or empty
     * @param maxPage       highest listing page to crawl; nothing is crawled when it is &lt;= 0
     * @param onlySeeLz     whether to request the "original poster only" view of each thread
     * @param addWaterImage whether to draw the watermark onto the saved images
     * @throws IllegalArgumentException if {@code key} is null or empty
     * @throws Exception on any network, parsing or file error
     */
    public static void getHttpUrl(String key, int maxPage, boolean onlySeeLz, boolean addWaterImage) throws Exception {
        if (key == null || key.isEmpty()) {
            throw new IllegalArgumentException("key must not be null or empty");
        }
        // Start from a clean slate so a second crawl does not re-download earlier threads.
        URL_MAP.clear();
        // The keyword may contain non-ASCII or reserved characters; the request announces ie=utf-8.
        String encodedKey = java.net.URLEncoder.encode(key, "UTF-8");

        // try-with-resources guarantees the client is closed even when a page request fails.
        try (CloseableHttpClient client = HttpClients.createDefault()) {
            for (int page = maxPage; page > 0; page--) {
                System.out.println("=============正在处理第" + page + "页===================");
                // "pn" is the offset of the first thread on the page.
                int pageIndex = (page - 1) * EVERY_PAGE_COUNT_SIZE;
                String spaderUrl = TB_BASE_URL + "/f?kw=" + encodedKey + "&ie=utf-8&pn=" + pageIndex;
                System.out.println("spaderUrl==== " + spaderUrl);
                processHtml(getHtmlContent(client, spaderUrl), onlySeeLz);
            }
        }
        downLoadImage(addWaterImage);
    }

    /**
     * Fetches a page with HTTP GET and returns its body decoded as UTF-8.
     *
     * @param client HTTP client used to execute the request
     * @param url    page URL
     * @return page source, or an empty string when the response carries no entity
     * @throws Exception on any I/O error
     */
    private static String getHtmlContent(CloseableHttpClient client, String url) throws Exception {
        HttpGet get = new HttpGet(url);
        get.setConfig(REQUEST_CONFIG);
        try {
            HttpEntity entity = client.execute(get).getEntity();
            return entity == null ? "" : EntityUtils.toString(entity, "UTF-8");
        } finally {
            // Hands the connection back even when reading the body failed.
            get.releaseConnection();
        }
    }

    /**
     * Extracts the thread links ({@code a.j_th_tit}) from a listing page and records
     * them in {@link #URL_MAP}.
     *
     * @param responseContent HTML of the listing page
     * @param onlySeeLz       whether to add the {@code see_lz=1} parameter to each thread URL
     * @throws Exception on parsing errors
     */
    private static void processHtml(String responseContent, boolean onlySeeLz) throws Exception {
        Document doc = Jsoup.parse(responseContent);
        for (Element link : doc.select("a.j_th_tit")) {
            String href = link.attr("href");
            if (href.isEmpty()) {
                continue;
            }
            String threadUrl;
            if (href.startsWith("http://") || href.startsWith("https://")) {
                threadUrl = href;
            } else if (href.startsWith("//")) {
                // protocol-relative link
                threadUrl = "https:" + href;
            } else {
                threadUrl = TB_BASE_URL + href;
            }
            if (onlySeeLz) {
                // Use '&' when the link already carries a query string.
                threadUrl += (threadUrl.indexOf('?') >= 0 ? "&" : "?") + "see_lz=1";
            }
            URL_MAP.put(threadUrl, link.text());
        }
    }

    /**
     * Visits every collected thread and saves each {@code img.BDE_Image} picture it contains.
     *
     * @param addWaterImage whether to draw the watermark onto the saved images
     * @throws Exception on any network or file error
     */
    private static void downLoadImage(boolean addWaterImage) throws Exception {
        for (String threadUrl : URL_MAP.keySet()) {
            System.out.println("帖子的url=== " + threadUrl);
            Document doc = Jsoup.connect(threadUrl).get();
            for (Element img : doc.select("img.BDE_Image")) {
                // absUrl resolves relative / protocol-relative sources against the thread URL
                // and yields "" when no absolute URL can be built.
                String imageUrl = img.absUrl("src");
                if (imageUrl.isEmpty()) {
                    continue;
                }
                System.out.println("imageUrl============ " + imageUrl);
                saveImage(imageUrl, addWaterImage);
            }
        }
    }

    /**
     * Saves one image below {@link #IMAGE_SAVE_DIRECT}, in a sub-directory named after the
     * current day ({@code yyyyMMdd}), under a random file name with the {@code .jpg} suffix.
     *
     * @param imageUrl      absolute URL of the image
     * @param addWaterImage whether to draw the watermark onto the image before saving
     * @throws Exception when the directory cannot be created or on any download, decoding
     *                   or write error
     */
    private static void saveImage(String imageUrl, boolean addWaterImage) throws Exception {
        // One directory per day.
        String dayDirName = new SimpleDateFormat("yyyyMMdd").format(new Date());
        File dayDir = new File(IMAGE_SAVE_DIRECT, dayDirName);
        if (!dayDir.isDirectory()) {
            if (dayDir.mkdirs()) {
                System.out.println("---------创建目录成功-------------");
            } else if (!dayDir.isDirectory()) {
                throw new java.io.IOException("Cannot create image directory: " + dayDir);
            }
        }

        // Random, collision-free file name.
        String fileName = UUID.randomUUID().toString().replace("-", "");
        File target = new File(dayDir, fileName + ".jpg");
        URL url = new URL(imageUrl);

        if (addWaterImage) {
            // Decode and render first so that no empty file is left behind when decoding fails.
            BufferedImage watermarked = addWaterImage(url);
            try (OutputStream os = new FileOutputStream(target)) {
                // ImageIO replaces the internal com.sun.image.codec.jpeg encoder,
                // which no longer exists since JDK 9.
                if (!ImageIO.write(watermarked, "jpg", os)) {
                    throw new java.io.IOException("No JPEG writer available for " + target);
                }
            }
        } else {
            try (InputStream is = url.openStream();
                 OutputStream os = new FileOutputStream(target)) {
                saveImageWithoutWaterImage(is, os);
            }
        }
    }

    /**
     * Copies all bytes from {@code is} to {@code os}. The caller owns and closes both streams.
     *
     * @param is source stream
     * @param os destination stream
     * @throws Exception on any I/O error
     */
    private static void saveImageWithoutWaterImage(InputStream is, OutputStream os) throws Exception {
        byte[] buff = new byte[8192];
        int readed;
        while ((readed = is.read(buff)) != -1) {
            os.write(buff, 0, readed);
        }
    }

    /**
     * Downloads the source image and tiles the watermark over it.
     *
     * @param sourceImagePath URL of the source image
     * @return a new RGB image of the same size as the source with the watermark drawn on it
     * @throws Exception when the source or the watermark image cannot be read or decoded
     */
    private static BufferedImage addWaterImage(URL sourceImagePath) throws Exception {
        // ImageIO.read returns null when no registered reader understands the data.
        BufferedImage source = ImageIO.read(sourceImagePath);
        if (source == null) {
            throw new java.io.IOException("Unsupported or corrupt image: " + sourceImagePath);
        }
        BufferedImage waterImage = ImageIO.read(new File(WATER_IMAGE_PATH));
        if (waterImage == null) {
            throw new java.io.IOException("Unsupported watermark image: " + WATER_IMAGE_PATH);
        }

        int width = source.getWidth();
        int height = source.getHeight();

        // Canvas of the same size; the source picture is painted first.
        BufferedImage canvas = new BufferedImage(width, height, BufferedImage.TYPE_INT_RGB);
        Graphics2D graphics2d = canvas.createGraphics();
        try {
            graphics2d.drawImage(source, 0, 0, width, height, null);

            // Watermark transparency.
            graphics2d.setComposite(AlphaComposite.getInstance(AlphaComposite.SRC_ATOP, WATER_IMAGE_ALPHA));
            // Rotate around the centre of the picture.
            graphics2d.rotate(Math.toRadians(WATER_IMAGE_RADIANS), width / 2, height / 2);

            // Tile over an area larger than the canvas (-0.5x .. 1.5x on both axes)
            // so the corners stay covered after rotation.
            int stepX = waterImage.getWidth() + WATER_IMAGE_MARGIN_X;
            int stepY = waterImage.getHeight() + WATER_IMAGE_MARGIN_Y;
            for (int x = -width / 2; x < width * 1.5; x += stepX) {
                for (int y = -height / 2; y < height * 1.5; y += stepY) {
                    graphics2d.drawImage(waterImage, x, y, null);
                }
            }
        } finally {
            graphics2d.dispose();
        }
        return canvas;
    }

    /**
     * Demo entry point: crawls the first page of one forum in "original poster only"
     * mode and saves the images with the watermark applied.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            boolean onlySeeLz = true;
            String key = "柳岩";
            int maxPage = 1;
            boolean addWaterImage = true;
            getHttpUrl(key, maxPage, onlySeeLz, addWaterImage);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

JAVA爬取百度贴吧图片的更多相关文章

  1. Java爬取 百度图片Google图片Bing图片

    先看看抓取的结果. 8个Java类: Startup.java - main函数 ImageCrawler.java - Crawler基类 BaiduImageCrawler.java - 百度图片 ...

  2. Python简易爬虫爬取百度贴吧图片

    通过python 来实现这样一个简单的爬虫功能,把我们想要的图片爬取到本地.(Python版本为3.6.0) 一.获取整个页面数据 def getHtml(url): page=urllib.requ ...

  3. 【Python】Python简易爬虫爬取百度贴吧图片

    通过python 来实现这样一个简单的爬虫功能,把我们想要的图片爬取到本地.(Python版本为3.6.0) 一.获取整个页面数据 def getHtml(url): page=urllib.requ ...

  4. java爬取百度首页源代码

    爬虫感觉挺有意思的,写一个最简单的抓取百度首页html代码的程序.虽然简单了一点,后期会加深的. package test; import java.io.BufferedReader; import ...

  5. Python: 爬取百度贴吧图片

    练习之代码片段,以做备忘: # encoding=utf8 from __future__ import unicode_literals import urllib, urllib2 import ...

  6. Python爬取百度贴吧图片

    一.获取URL Urllib 模块提供了读取web页面数据的接口,我们可以像读取本地文件一样读取www和ftp上的数据.首先,我们定义了一个getHtml()函数: urllib.urlopen()方 ...

  7. Python每日一练(3):爬取百度贴吧图片

    import requests,re #先把要访问URL和头部准备好 url = 'http://tieba.baidu.com/p/2166231880' head = { 'Accept': '* ...

  8. 百度图片爬虫-python版-如何爬取百度图片?

    上一篇我写了如何爬取百度网盘的爬虫,在这里还是重温一下,把链接附上: http://www.cnblogs.com/huangxie/p/5473273.html 这一篇我想写写如何爬取百度图片的爬虫 ...

  9. python爬取某个网页的图片-如百度贴吧

    python爬取某个网页的图片-如百度贴吧 作者:vpoet mail:vpoet_sir@163.com 注:随意copy,不用告诉我 #coding:utf-8 import urllib imp ...

随机推荐

  1. Tomcat启动之异常java.lang.IllegalStateException

    严重: Exception sending context destroyed event to listener instance of class org.springframework.web. ...

  2. 96D - Police Stations

    96D - Police Stations 思路:bfs,从所有的警察局开始bfs,因为bfs的深度一样,而且题目给的树保证满足条件,所以不用考虑深度. 如果搜索到一个点a,他的下一个点b已经被搜索过 ...

  3. English trip V1 - 1.How Do You Feel Now? Teacher:Lamb Key:形容词(Adjectives)

    In this lesson you will learn to describe people, things, and feelings.在本课中,您将学习如何描述人,事和感受. STARTER  ...

  4. Spring Cloud 学习网址

    1. https://blog.csdn.net/forezp/article/details/70148833  史上最简单的 SpringCloud 教程 (非常适合新手快速上手教程)2.http ...

  5. android--------微信 Tinker 热修复 (二)

    前面简单介绍了一下Tinker热修复,今天就来分享一下如何在Android中使用,希望对各位有帮助. 1:Tinker 接入指南 在项目的build.gradle中,添加tinker-patch-gr ...

  6. Leetcode 92

    /** * Definition for singly-linked list. * struct ListNode { * int val; * ListNode *next; * ListNode ...

  7. java.lang.Exception: Socket bind failed: [730048]

    严重: Error initializing endpoint java.lang.Exception: Socket bind failed: [730048] 通常每个套接字地址(协议/ ...

  8. spring cloud学习(六)Spring Cloud Config

    Spring Cloud Config 参考个人项目 参考个人项目 : (希望大家能给个star~) https://github.com/FunriLy/springcloud-study/tree ...

  9. NOIP初赛 BLESS ALL!

    祝初赛顺利!RP++! 下午再写一篇题解来加RP

  10. jsp el函数库

    EL函数库介绍 由于在JSP页面中显示数据时,经常需要对显示的字符串进行处理,所以SUN公司针对一些常见的处理定义了一套EL函数库供开发者使用. 这些EL函数在JSTL开发包中进行描述,因此在JSP页 ...