webcollector + selenium 爬取空间相册图片
package cn.hb.util; import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.UUID;
import java.util.concurrent.TimeUnit; import org.apache.commons.io.IOUtils;
import org.openqa.selenium.By;
import org.openqa.selenium.Cookie;
import org.openqa.selenium.JavascriptExecutor;
import org.openqa.selenium.Keys;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.firefox.FirefoxDriver;
import org.openqa.selenium.firefox.FirefoxOptions;
import org.openqa.selenium.interactions.Actions;
import cn.edu.hfut.dmic.webcollector.conf.Configuration;
import cn.edu.hfut.dmic.webcollector.model.CrawlDatum;
import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.net.HttpRequest;
import cn.edu.hfut.dmic.webcollector.plugin.berkeley.BreadthCrawler;
import cn.edu.hfut.dmic.webcollector.util.FileUtils; /**
* 爬取空间图片 selenium登录后提取链接给webcollector处理即可
*
* @author tele
*
*/
public class QZoneCrawler extends BreadthCrawler {

    // Entry page of the target QZone space (placeholder: replace "qq号" with the real QQ number).
    static String url = "https://user.qzone.qq.com/qq号";

    // Cookie header captured from the Selenium session; sent with every crawler request.
    static String cookies = "";

    // Maximum number of photos the album view shows on one page.
    static final int pageSize = 98;

    // Image URLs collected by login(); used as crawler seeds in main().
    static List<String> crawdataList = new ArrayList<String>();

    // Directory the downloaded images are written to.
    static File baseDir = new File("F:/qz/image");

    // User-Agent header sent with every crawler request.
    String userAgent = "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0";

    public QZoneCrawler(String crawlPath, boolean autoParse) {
        super(crawlPath, autoParse);
    }

    /**
     * Saves the content of a fetched page as a randomly named {@code .jpg} file under
     * {@link #baseDir}. No follow-up links are added to {@code next}.
     *
     * @param page the fetched page whose content is written to disk
     * @param next follow-up crawl tasks (unused)
     */
    @Override
    public void visit(Page page, CrawlDatums next) {
        try {
            // Throttle downloads a little.
            Thread.sleep(3000);
        } catch (InterruptedException e) {
            // Restore the interrupt flag so the owning thread can still observe the interruption.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        }
        String name = UUID.randomUUID().toString() + ".jpg";
        try {
            FileUtils.write(new File(baseDir, name), page.content());
        } catch (IOException e) {
            // Best effort: a failed write of one image must not stop the crawl.
            e.printStackTrace();
        }
    }

    /**
     * Issues the HTTP request for a crawl task with the captured login cookies and the
     * configured User-Agent attached.
     *
     * @param crawlDatum the crawl task to fetch
     * @return the response page
     * @throws Exception if the request fails
     */
    @Override
    public Page getResponse(CrawlDatum crawlDatum) throws Exception {
        HttpRequest request = new HttpRequest(crawlDatum);
        request.setCookie(cookies);
        request.setUserAgent(userAgent);
        return request.responsePage();
    }

    /**
     * Configures the crawler, collects the image URLs through a Selenium login session and
     * then downloads them.
     */
    public static void main(String[] args) throws Exception {
        QZoneCrawler qz = new QZoneCrawler("F:/qz/image/webcollector", true);

        Configuration conf = Configuration.copyDefault();
        conf.setAutoDetectImg(true);
        conf.setConnectTimeout(5000);
        conf.setReadTimeout(10000);
        // Interval between crawl executions.
        conf.setExecuteInterval(5000);
        qz.setConf(conf);
        qz.setThreads(100);

        login();
        qz.addSeed(crawdataList);
        qz.start(1);
    }

    /**
     * Logs in to QZone with Firefox, captures the session cookies into {@link #cookies} and
     * collects the image URLs of the albums into {@link #crawdataList}. Collection stops at
     * the first album whose image links cannot be extracted. The browser is always closed
     * when this method returns, whether normally or with an exception.
     *
     * @throws InterruptedException if the thread is interrupted while waiting for the page
     * @throws IOException          if writing an album's URL list fails
     */
    public static void login() throws InterruptedException, IOException {
        System.setProperty("webdriver.gecko.driver", "D:/browserdriver/geckodriver.exe");

        FirefoxOptions options = new FirefoxOptions();
        options.setBinary("F:/ff/firefox.exe");

        WebDriver driver = new FirefoxDriver(options);
        try {
            driver.manage().window().maximize();

            // Use short timeouts: the login frame is available before the page finishes
            // loading, so a timeout here is expected and only reported.
            try {
                driver.manage().timeouts().pageLoadTimeout(3, TimeUnit.SECONDS);
                driver.manage().timeouts().setScriptTimeout(3, TimeUnit.SECONDS);
                driver.get(url);
            } catch (Exception e) {
                System.out.println("所需元素已出现,停止加载页面");
            }

            // Switch into the login frame and select account/password login.
            driver.switchTo().frame("login_frame");
            WebElement switcher_plogin = driver.findElement(By.id("switcher_plogin"));
            System.out.println(switcher_plogin.getText());
            if (switcher_plogin.isDisplayed()) {
                switcher_plogin.click();
            }

            // User name (placeholder value).
            WebElement userInput = driver.findElement(By.id("u"));
            userInput.clear();
            userInput.sendKeys("账号");

            // Password (placeholder value).
            WebElement passwordInput = driver.findElement(By.id("p"));
            passwordInput.clear();
            passwordInput.sendKeys("密码");

            // Submit. A failing click is only reported; the URL check below decides whether
            // to retry. The sleep is outside the try so an interruption propagates to the caller.
            try {
                driver.findElement(By.id("login_button")).click();
            } catch (Exception e) {
                e.printStackTrace();
            }
            Thread.sleep(3000);

            if ("https://i.qq.com/".equals(driver.getCurrentUrl())) {
                System.out.println("登录失败!5秒后再次尝试登录");
                Thread.sleep(5000);
                driver.findElement(By.id("login_button")).click();
                // Give the retry the same time to complete as the first attempt.
                Thread.sleep(3000);
            }

            // Leave the login frame.
            driver.switchTo().defaultContent();
            System.out.println(driver.getCurrentUrl());

            JavascriptExecutor jsExecutor = (JavascriptExecutor) driver;

            // NOTE(review): an optional "friendship" prompt layer may overlay the page here and
            // need dismissing; the original code for that was disabled.

            // Open the album tab and switch into its content frame.
            driver.findElement(By.cssSelector("#menuContainer ul.head-nav-menu>li.menu_item_4>a")).click();
            Thread.sleep(2000);
            driver.switchTo().frame(driver.findElement(By.className("app_canvas_frame")));

            // Join the session cookies into a single Cookie header value.
            StringBuilder builder = new StringBuilder();
            Set<Cookie> cookieSet = driver.manage().getCookies();
            for (Cookie c : cookieSet) {
                builder.append(c.getName()).append("=").append(c.getValue()).append("; ");
            }
            cookies = builder.toString();

            // Album list.
            List<WebElement> photoList = driver.findElements(By.xpath("//ul[@class='js-album-list-ul']/li"));
            if (photoList == null || photoList.isEmpty()) {
                throw new RuntimeException("定位相册列表元素失败!");
            }
            int albumCount = photoList.size();

            String newUrl = driver.getCurrentUrl();
            for (int i = 0; i < albumCount; i++) {
                // XPath of the (i + 1)-th album entry.
                String xpath = "//ul[@class='js-album-list-ul']/li[" + (i + 1) + "]";

                // Open a new tab for this album. The URL is passed as a script argument
                // instead of being spliced into the JavaScript source.
                jsExecutor.executeScript("window.open(arguments[0]);", newUrl);

                // Wait before reading the handles so the new tab is included in the list.
                Thread.sleep(2000);
                List<String> allHandles = new ArrayList<String>(driver.getWindowHandles());

                // Switching windows takes a moment.
                driver.switchTo().window(allHandles.get(i + 1));
                Thread.sleep(2000);

                List<String> urlList = getImageUrl(driver, xpath);
                if (urlList == null) {
                    break;
                }
                crawdataList.addAll(urlList);
            }

            System.out.println("所有相册图片链接提取完毕,退出浏览器");
        } finally {
            // Always shut the browser down, also when a step above throws.
            driver.quit();
        }
    }

    /**
     * Extracts the image URLs of one album. The URLs are also written, one per line, to
     * {@code f:/qz/<album name>.txt}; an existing file of that name is overwritten.
     *
     * @param driver the driver, positioned on the top-level document of the space page
     * @param xpath  XPath of the album's list entry
     * @return the image URLs of the album, or {@code null} if a page yields no image elements
     *         (album not accessible or elements not found)
     * @throws InterruptedException if the thread is interrupted while waiting for the page
     * @throws IOException          if the URL list file cannot be created or written
     */
    public static List<String> getImageUrl(WebDriver driver, String xpath) throws InterruptedException, IOException {
        List<String> urlList = new ArrayList<String>();

        // Open the album tab and switch into its content frame.
        driver.findElement(By.cssSelector("#menuContainer ul.head-nav-menu>li.menu_item_4>a")).click();
        driver.switchTo().frame(driver.findElement(By.className("app_canvas_frame")));
        Thread.sleep(1000);

        // Album name.
        String photo_name = driver.findElement(By.xpath(xpath + "//a[@class='c-tx2 js-album-desc-a']")).getText();

        // The album name is free text; replace characters that are not allowed in Windows file names.
        String safeName = photo_name.replaceAll("[\\\\/:*?\"<>|]", "_");
        File imageUrl = new File("f:/qz/" + safeName + ".txt");

        // Create the parent directory, not a directory named after the file itself.
        File parentDir = imageUrl.getParentFile();
        if (!parentDir.exists() && !parentDir.mkdirs()) {
            throw new IOException("Cannot create directory " + parentDir);
        }

        // Total number of photos in the album.
        WebElement span = driver.findElement(By.xpath(xpath + "/div[1]/div[1]/a" + "/span"));
        int count = Integer.parseInt(span.getText().trim());

        // Enter the album's photo list.
        driver.findElement(By.xpath(xpath + "/div[1]/div[1]/a")).click();
        Thread.sleep(3000);

        // Number of pages, rounded up.
        int totalPage = (count + pageSize - 1) / pageSize;
        System.out.println(photo_name + "图片总数为----" + count + "张,共计---" + totalPage + "页");

        // Non-append mode truncates a stale file from an earlier run; try-with-resources
        // closes the writer on every exit path, including the early return below.
        try (FileWriter fileWriter = new FileWriter(imageUrl)) {
            Actions actions = new Actions(driver);
            for (int i = 0; i < totalPage; i++) {
                // Simulate key presses to scroll down so the page loads its images.
                for (int j = 0; j < 50; j++) {
                    if (j % 5 == 0) {
                        Thread.sleep(1000);
                    }
                    actions.sendKeys(Keys.ARROW_DOWN).perform();
                }

                // Image elements of the current page.
                List<WebElement> list = driver.findElements(
                        By.xpath("//a[@class='item-cover j-pl-photoitem-imgctn']/img[@class='j-pl-photoitem-img']"));
                if (list == null || list.isEmpty()) {
                    // Album not accessible or elements not found.
                    System.out.println("无法提取图片链接!");
                    return null;
                }
                for (WebElement element : list) {
                    String src = element.getAttribute("src");
                    String line = src + "\n";
                    IOUtils.write(line, fileWriter);
                    System.out.println(line);
                    // Store the bare URL; the line terminator belongs to the file only.
                    urlList.add(src);
                }
                System.out.println("第" + (i + 1) + "页图片链接提取完毕");
                Thread.sleep(1000);

                // Go to the next page, if there is one.
                if ((i + 2) <= totalPage) {
                    driver.findElement(By.xpath("//a[@id='pager_num_1_" + (i + 2) + "']")).click();
                }
            }
        }
        return urlList;
    }
}
运行环境与上篇博文相同:https://www.cnblogs.com/tele-share/p/9595265.html

爬取结果:

webcollector + selenium 爬取空间相册图片的更多相关文章
- 爬虫学习06用selenium爬取空间
用selenium爬取空间 from selenium import webdriver from lxml import etree import time pro = webdriver.Chro ...
- selenium 爬取空间说说
package cn.hb.util; import java.io.File; import java.io.FileWriter; import java.io.IOException; impo ...
- Python_小林的爬取QQ空间相册图片链接程序
前言 昨天看见某人的空间有上传了XXXX个头像,然后我就想着下载回来[所以本质上这是一个头像下载程序],但是一个个另存为太浪费时间了,上网搜索有没有现成的工具,居然要注册码,还卖45一套.你们的良心也 ...
- 用WebCollector爬取站点的图片
用WebCollector爬取整站图片,仅仅须要遍历整站页面.然后将URL为.jpg.gif的页面(文件)保存到本地就可以. 比如我们爬取一个美食站点,获取里面全部的图片: import cn.edu ...
- [Python爬虫] Selenium爬取新浪微博客户端用户信息、热点话题及评论 (上)
转载自:http://blog.csdn.net/eastmount/article/details/51231852 一. 文章介绍 源码下载地址:http://download.csdn.net/ ...
- selenium爬取煎蛋网
selenium爬取煎蛋网 直接上代码 from selenium import webdriver from selenium.webdriver.support.ui import WebDriv ...
- 利用selenium爬取京东商品信息存放到mongodb
利用selenium爬取京东商城的商品信息思路: 1.首先进入京东的搜索页面,分析搜索页面信息可以得到路由结构 2.根据页面信息可以看到京东在搜索页面使用了懒加载,所以为了解决这个问题,使用递归.等待 ...
- 使用Python爬虫爬取网络美女图片
代码地址如下:http://www.demodashi.com/demo/13500.html 准备工作 安装python3.6 略 安装requests库(用于请求静态页面) pip install ...
- 使用Selenium爬取网站表格类数据
本文转载自一下网站:Python爬虫(5):Selenium 爬取东方财富网股票财务报表 https://www.makcyun.top/web_scraping_withpython5.html 需 ...
随机推荐
- [D3] Creating a D3 Force Layout in React
Learn how to leverage d3's layout module to create a Force Layout inside of React. We'll take a look ...
- Android 监听电量的状态
监控手机电量的变化和充电状态 在BatteryManager中有一个粘性广播,不需要BroadcastReceiver作为接收器,在注册时将传入null IntentFilter filter = n ...
- Android Mvvm模式的理解
1. Mvvm是什么,Mvvm是怎么来的?Mvvm模式广泛应用在WPF项目开发中,使用此模式可以把UI和业务逻辑分离开,使UI设计人员和业务逻辑人员能够分工明确. Mvvm模式是根据MVP模式来的,可 ...
- Android 仿今日头条频道管理(下)(GridView之间Item的移动和拖拽)
前言 上篇博客我们说到了今日头条频道管理的操作交互体验,我也介绍了2个GridView之间Item的相互移动.详情请參考:Android 仿今日头条频道管理(上)(GridView之间Item的移动和 ...
- UILabel基本用法
UILabel *_label = [[UILabel alloc]initWithFrame:CGRectMake(, self.view.frame.size.height*)]; _label. ...
- [Angular HTML] Implementing The Input Mask Cursor Navigation Functionality -- setSelectionRange
@HostListener('keydown', ['$event', '$event.keyCode']) onKeyDown($event: KeyboardEvent, keyCode) { i ...
- FragmentPagerAdapter和FragmentStatePagerAdapter的差别
ViewPager同意用户通过左右滑动显示不同页面的数据.而这些页面须要PagerAdapter管理. 经常使用的有FragmentPagerAdapter和FragmentStatePagerAda ...
- 度量空间(metric space)
一个度量空间(metric space)由一个有序对(ordered pair)(M,d) 表示,其中 M 是一种集合,d 是定义在 M 上的一种度量,是如下的一种函数映射: d:M×M→R 且对于任 ...
- ZOJ 1136 Longest Ordered Subsequence DP
传送门:http://acm.zju.edu.cn/onlinejudge/showProblem.do?problemId=1136 题目大意:给定一串序列,求最长的升序列长度,如1, 7, 3, ...
- C++中使用soap toolkit访问webService详解
使用Visual C++开发SOAP客户端应用 使用Visual C++开发SOAP客户端应用 简介 在本篇文章中,我们将讨论如何使用Visual C++开发一个简单的SOAP客户端应用程序,我们还 ...