webcollector + selenium 爬取空间相册图片

 package cn.hb.util;

 import java.io.File;

 import java.io.FileNotFoundException;

 import java.io.FileWriter;

 import java.io.IOException;

 import java.util.ArrayList;

 import java.util.List;

 import java.util.Set;

 import java.util.UUID;

 import java.util.concurrent.TimeUnit;

 import org.apache.commons.io.IOUtils;

 import org.openqa.selenium.By;

 import org.openqa.selenium.Cookie;

 import org.openqa.selenium.JavascriptExecutor;

 import org.openqa.selenium.Keys;

 import org.openqa.selenium.WebDriver;

 import org.openqa.selenium.WebElement;

 import org.openqa.selenium.firefox.FirefoxDriver;

 import org.openqa.selenium.firefox.FirefoxOptions;

 import org.openqa.selenium.interactions.Actions;

 import cn.edu.hfut.dmic.webcollector.conf.Configuration;

 import cn.edu.hfut.dmic.webcollector.model.CrawlDatum;

 import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;

 import cn.edu.hfut.dmic.webcollector.model.Page;

 import cn.edu.hfut.dmic.webcollector.net.HttpRequest;

 import cn.edu.hfut.dmic.webcollector.plugin.berkeley.BreadthCrawler;

 import cn.edu.hfut.dmic.webcollector.util.FileUtils;

 /**

  * 爬取空间图片 selenium登录后提取链接给webcollector处理即可

  *

  * @author tele

  *

  */

 public class QZoneCrawler extends BreadthCrawler {

     // Page opened first by login(); "qq号" ("QQ number") is presumably a placeholder to be
     // replaced with the target account's number before running.
     static String url = "https://user.qzone.qq.com/qq号";

     // Cookie header value ("name=value; " pairs) filled in by login() and attached to every
     // request in getResponse().
     static String cookies = "";

     // Number of images per album page; getImageUrl() divides the album's image count by this
     // to work out how many pages to walk through.
     static final int pageSize = 98;

     // Image URLs gathered by login(); main() hands this list to the crawler as its seeds.
     static List<String> crawdataList = new ArrayList<String>();

     // Directory into which visit() writes each downloaded page.
     static File baseDir = new File("F:/qz/image");

     /**
      * Creates the crawler by forwarding both arguments unchanged to the
      * {@link BreadthCrawler} constructor.
      *
      * @param crawlPath passed straight to the superclass; presumably the directory in which
      *                  WebCollector keeps its crawl state (TODO confirm against WebCollector docs)
      * @param autoParse passed straight to the superclass; presumably toggles automatic link
      *                  extraction (TODO confirm against WebCollector docs)
      */
     public QZoneCrawler(String crawlPath, boolean autoParse) {

         super(crawlPath, autoParse);

     }

     /**
      * Stores the content of a fetched page as a randomly named {@code .jpg} file inside
      * {@link #baseDir}.
      *
      * <p>The method pauses for three seconds before writing. A failed write is reported on
      * standard error and does not stop the crawl.
      *
      * @param page the fetched page whose content is written to disk
      * @param next follow-up tasks container; not used by this crawler
      */
     @Override
     public void visit(Page page, CrawlDatums next) {
         try {
             // Throttle: wait before handling each fetched page.
             Thread.sleep(3000);
         } catch (InterruptedException e) {
             // Restore the flag so the calling thread can still observe the interruption;
             // the already-downloaded content is saved regardless.
             Thread.currentThread().interrupt();
         }

         // A random UUID keeps concurrently running crawler threads from overwriting each other.
         File target = new File(baseDir, UUID.randomUUID().toString() + ".jpg");
         try {
             FileUtils.write(target, page.content());
         } catch (IOException e) {
             // FileNotFoundException is an IOException, so a single handler covers both cases.
             System.err.println("Failed to save image to " + target);
             e.printStackTrace();
         }
     }

     // User-Agent header value attached to every request built in getResponse().
     String userAgent = "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0";

     /**
      * Builds the HTTP request for a crawl task, attaches the cookie string collected by
      * {@code login()} and the {@link #userAgent} value, and returns the resulting page.
      *
      * @param crawlDatum the crawl task to fetch
      * @return the page produced by executing the request
      * @throws Exception whatever the underlying request raises
      */
     @Override
     public Page getResponse(CrawlDatum crawlDatum) throws Exception {
         final HttpRequest authenticatedRequest = new HttpRequest(crawlDatum);
         authenticatedRequest.setCookie(cookies);
         authenticatedRequest.setUserAgent(userAgent);

         final Page fetched = authenticatedRequest.responsePage();
         return fetched;
     }

     /**
      * Program entry point: configures the crawler, runs the browser login to collect cookies
      * and image URLs, seeds the crawler with those URLs and starts it with an argument of 1.
      *
      * @param args command-line arguments; not used
      * @throws Exception if the login or the crawl fails
      */
     public static void main(String[] args) throws Exception {
         final QZoneCrawler crawler = new QZoneCrawler("F:/qz/image/webcollector", true);

         final Configuration settings = Configuration.copyDefault();
         settings.setAutoDetectImg(true);
         // Timeouts handed to the crawler configuration.
         settings.setConnectTimeout(5000);
         settings.setReadTimeout(10000);
         // Interval between task executions of the crawler threads.
         settings.setExecuteInterval(5000);

         crawler.setConf(settings);
         crawler.setThreads(100);

         // The browser session fills both the cookie string and the seed list.
         login();

         crawler.addSeed(crawdataList);
         crawler.start(1);
     }

     /**
      * Logs in to QZone through a Firefox WebDriver session, stores the session cookies in
      * {@link #cookies} and appends the image URLs of every album to {@link #crawdataList}.
      *
      * <p>The browser is always closed before this method returns, including when a step
      * fails. An album for which {@code getImageUrl} returns {@code null} is skipped and the
      * remaining albums are still processed.
      *
      * @throws InterruptedException if the thread is interrupted during one of the waits
      * @throws IOException          if writing an album's URL list fails
      * @throws RuntimeException     if the album list cannot be located
      * @throws IllegalStateException if the browser does not open a new tab for an album
      */
     public static void login() throws InterruptedException, IOException {
         System.setProperty("webdriver.gecko.driver", "D:/browserdriver/geckodriver.exe");
         FirefoxOptions options = new FirefoxOptions();
         options.setBinary("F:/ff/firefox.exe");
         WebDriver driver = new FirefoxDriver(options);

         try {
             driver.manage().window().maximize();
             openLoginPage(driver);
             submitCredentials(driver);
             System.out.println(driver.getCurrentUrl());

             JavascriptExecutor jsExecutor = (JavascriptExecutor) driver;

             // Disabled: dismiss the "intimacy" prompt if it is shown.
             /*
              * try { WebElement fs_guide = driver.findElement(By.xpath(
              * "//div[@id='friendship_promote_layer']/table[@class='tbl-fs-guide']//a"
              * )); if(fs_guide != null && fs_guide.isDisplayed()) {
              * fs_guide.click(); } } catch (Exception e) { e.printStackTrace();
              * }finally {
              *
              * }
              */

             // Open the album section and enter the frame that hosts it.
             driver.findElement(By.cssSelector("#menuContainer ul.head-nav-menu>li.menu_item_4>a")).click();
             Thread.sleep(2000);
             driver.switchTo().frame(driver.findElement(By.className("app_canvas_frame")));

             // Disabled: go straight into the first album's image list.
             // driver.findElement(By.xpath("//ul[@class='js-album-list-ul']/li[1]/div[1]/div[1]/a")).click();

             cookies = buildCookieHeader(driver.manage().getCookies());

             // Album list.
             List<WebElement> photoList = driver.findElements(By.xpath("//ul[@class='js-album-list-ul']/li"));
             if (photoList == null || photoList.isEmpty()) {
                 throw new RuntimeException("定位相册列表元素失败!");
             }
             int albumCount = photoList.size();

             String newUrl = driver.getCurrentUrl();
             for (int i = 1; i <= albumCount; i++) {
                 // XPath indices are 1-based; each album is handled in its own fresh tab.
                 String xpath = "//ul[@class='js-album-list-ul']/li[" + i + "]";
                 switchToNewTab(driver, jsExecutor, newUrl);

                 List<String> urlList = getImageUrl(driver, xpath);
                 if (urlList == null) {
                     // One inaccessible album must not abort the remaining ones.
                     continue;
                 }
                 crawdataList.addAll(urlList);
             }
             System.out.println("所有相册图片链接提取完毕,退出浏览器");
         } finally {
             // Close the browser on every path so a failure does not leave Firefox running.
             driver.quit();
         }
     }

     /** Sets short load/script timeouts and opens {@link #url}; a load failure is tolerated. */
     private static void openLoginPage(WebDriver driver) {
         try {
             driver.manage().timeouts().pageLoadTimeout(3, TimeUnit.SECONDS);
             driver.manage().timeouts().setScriptTimeout(3, TimeUnit.SECONDS);
             driver.get(url);
         } catch (Exception e) {
             // Best effort: the short page-load timeout is expected to cut loading off, and the
             // following steps only need the login frame to be present.
             System.out.println("所需元素已出现,停止加载页面");
         }
     }

     /** Fills in the login form inside the login frame, submits it and retries once on failure. */
     private static void submitCredentials(WebDriver driver) throws InterruptedException {
         driver.switchTo().frame("login_frame");

         // Switch to the account/password form when the switcher is shown.
         WebElement switcher_plogin = driver.findElement(By.id("switcher_plogin"));
         System.out.println(switcher_plogin.getText());
         if (switcher_plogin.isDisplayed()) {
             switcher_plogin.click();
         }

         // Account.
         driver.findElement(By.id("u")).clear();
         driver.findElement(By.id("u")).sendKeys("账号");
         // Password.
         driver.findElement(By.id("p")).clear();
         driver.findElement(By.id("p")).sendKeys("密码");

         try {
             driver.findElement(By.id("login_button")).click();
             Thread.sleep(3000);
         } catch (RuntimeException e) {
             // Only WebDriver failures are tolerated here; an InterruptedException from the
             // sleep propagates to the caller instead of being swallowed.
             e.printStackTrace();
         }

         // Still being on this URL is treated as a failed login: wait and try once more.
         if ("https://i.qq.com/".equals(driver.getCurrentUrl())) {
             System.out.println("登录失败!5秒后再次尝试登录");
             Thread.sleep(5000);
             driver.findElement(By.id("login_button")).click();
             // Give the retry the same time to complete as the first attempt.
             Thread.sleep(3000);
         }

         // Leave the login frame.
         driver.switchTo().defaultContent();
     }

     /** Joins cookies into a {@code name=value; } sequence usable as a Cookie header value. */
     private static String buildCookieHeader(Set<Cookie> cookieSet) {
         StringBuilder builder = new StringBuilder();
         for (Cookie c : cookieSet) {
             builder.append(c.getName()).append("=").append(c.getValue()).append("; ");
         }
         return builder.toString();
     }

     /** Opens {@code targetUrl} in a new tab and moves the driver's focus to that tab. */
     private static void switchToNewTab(WebDriver driver, JavascriptExecutor jsExecutor, String targetUrl)
             throws InterruptedException {
         Set<String> handlesBefore = driver.getWindowHandles();

         // Pass the URL as a script argument so quotes in it cannot break the script.
         jsExecutor.executeScript("window.open(arguments[0]);", targetUrl);
         Thread.sleep(2000);

         // Identify the new tab as the handle that did not exist before, instead of relying
         // on the iteration order of the handle set.
         List<String> opened = new ArrayList<>(driver.getWindowHandles());
         opened.removeAll(handlesBefore);
         if (opened.isEmpty()) {
             throw new IllegalStateException("No new browser tab was opened for " + targetUrl);
         }
         driver.switchTo().window(opened.get(0));

         // Switching the window handle takes time.
         Thread.sleep(2000);
     }

     /**

      * 提取图片url

      *

      * @param driver

      * @param xpath

      * @throws InterruptedException

      * @throws IOException

      */

     public static List<String> getImageUrl(WebDriver driver, String xpath) throws InterruptedException, IOException {

         List<String> urlList = new ArrayList<String>();

         // 点击相册

         driver.findElement(By.cssSelector("#menuContainer ul.head-nav-menu>li.menu_item_4>a")).click();

         // 切换到图片的frame

         driver.switchTo().frame(driver.findElement(By.className("app_canvas_frame")));

         Thread.sleep(1000);

         // 获得相册名称

         String photo_name = driver.findElement(By.xpath(xpath + "//a[@class='c-tx2 js-album-desc-a']")).getText();

         //// 文件夹检测

         File imageUrl = new File("f:/qz/" + photo_name + ".txt");

         if (!imageUrl.getParentFile().exists()) {

             imageUrl.mkdirs();

         } else {

             imageUrl.delete();

         }

         // 获得图片总数,每页最多98张图片

         WebElement span = driver.findElement(By.xpath(xpath + "/div[1]/div[1]/a" + "/span"));

         String text = span.getText();

         int count = Integer.parseInt(text);

         // 进入列表

         driver.findElement(By.xpath(xpath + "/div[1]/div[1]/a")).click();

         Thread.sleep(3000);

         // 计算页数

         int totalPage = (int) Math.ceil((double) count / (double) pageSize);

         System.out.println(photo_name + "图片总数为----" + count + "张,共计---" + totalPage + "页");

         FileWriter fileWriter = new FileWriter(imageUrl, true);

 　　　　　Actions actions = new Actions(driver);

         for (int i = 0; i < totalPage; i++) {

             // 模拟按键加载图片

      //       Actions actions = new Actions(driver);

             for (int j = 0; j < 50; j++) {

                 if (j % 5 == 0) {

                     Thread.sleep(1000);

                 }

                 actions.sendKeys(Keys.ARROW_DOWN).perform();

             }

             // 提取本页的image链接

             List<WebElement> list = driver.findElements(

                     By.xpath("//a[@class='item-cover j-pl-photoitem-imgctn']/img[@class='j-pl-photoitem-img']"));

             if (list == null || list.size() == 0) {

                 // 相册无权限访问或定位失败

                 System.out.println("无法提取图片链接!");

                 return null;

             }

             for (WebElement element : list) {

                 String src = element.getAttribute("src") + "\n";

                 IOUtils.write(src, fileWriter);

                 System.out.println(src);

                 // 添加链接

                 urlList.add(src);

             }

             System.out.println("第" + (i + 1) + "页图片链接提取完毕");

             Thread.sleep(1000);

             // 跳转到下一页

             if ((i + 2) <= totalPage) {

                 driver.findElement(By.xpath("//a[@id='pager_num_1_" + (i + 2) + "']")).click();

                 ;

             }

         }

         fileWriter.close();

         return urlList;

     }

 }

运行环境与上篇博文相同，详见：https://www.cnblogs.com/tele-share/p/9595265.html

爬取结果：

webcollector + selenium 爬取空间相册图片的更多相关文章

爬虫学习06用selenium爬取空间
用selenium爬取空间 from selenium import webdriver from lxml import etree import time pro = webdriver.Chro ...
selenium 爬取空间说说
package cn.hb.util; import java.io.File; import java.io.FileWriter; import java.io.IOException; impo ...
Python_小林的爬取QQ空间相册图片链接程序
前言昨天看见某人的空间有上传了XXXX个头像,然后我就想着下载回来[所以本质上这是一个头像下载程序],但是一个个另存为太浪费时间了,上网搜索有没有现成的工具,居然要注册码,还卖45一套.你们的良心也 ...
用WebCollector爬取站点的图片
用WebCollector爬取整站图片,仅仅须要遍历整站页面.然后将URL为.jpg.gif的页面(文件)保存到本地就可以. 比如我们爬取一个美食站点,获取里面全部的图片: import cn.edu ...
[Python爬虫] Selenium爬取新浪微博客户端用户信息、热点话题及评论 (上)
转载自:http://blog.csdn.net/eastmount/article/details/51231852 一. 文章介绍源码下载地址:http://download.csdn.net/ ...
selenium爬取煎蛋网
selenium爬取煎蛋网直接上代码 from selenium import webdriver from selenium.webdriver.support.ui import WebDriv ...
利用selenium爬取京东商品信息存放到mongodb
利用selenium爬取京东商城的商品信息思路: 1.首先进入京东的搜索页面,分析搜索页面信息可以得到路由结构 2.根据页面信息可以看到京东在搜索页面使用了懒加载,所以为了解决这个问题,使用递归.等待 ...
使用Python爬虫爬取网络美女图片
代码地址如下:http://www.demodashi.com/demo/13500.html 准备工作安装python3.6 略安装requests库(用于请求静态页面) pip install ...
使用Selenium爬取网站表格类数据
本文转载自一下网站:Python爬虫(5):Selenium 爬取东方财富网股票财务报表 https://www.makcyun.top/web_scraping_withpython5.html 需 ...

随机推荐

限制tomcat仅响应本机请求(转）
http://blog.bbzhh.com/index.php/archives/135.html 在VPS上搭建了nginx和tomcat应用,想通过nginx来反向代理127.0.0.1:8080 ...
iOS开发之CocoaPods（objective-c第三方库管理工具）
介绍: iOS开发中,大多数情况下,我们都须要集成一些第三方依赖库.对于一个稍大的项目,用到的第三方依赖库的数量也很可观.CocoaPods是objective-c第三方库管理工具,方便第三方库的管理 ...
Java核心技术卷Ⅰ 基础知识（7）
第13章集合集合接口具体的集合在表中,除了Map结尾的类之外,其他类都实现了Collection接口,而以Map结尾的类实现了Map接口. 链表数组列表散列集树集双端队列优先级队列 ...
关于python的冒号截取
https://zhidao.baidu.com/question/877855739656978372.html
学习笔记：mpvue开发小程序——入门
接下来可能要开发一个小程序,同事推荐使用mpvue,那么我提前熟悉下. 官网地址:http://mpvue.com/ 1.快速上手 http://mpvue.com/mpvue/quickstart/ ...
微服务实战（三）：深入微服务架构的进程间通信 - DockOne.io
原文:微服务实战(三):深入微服务架构的进程间通信 - DockOne.io [编者的话]这是采用微服务架构创建自己应用系列第三篇文章.第一篇介绍了微服务架构模式,和单体式模式进行了比较,并且讨论了使 ...
Codeforces 145A-Lucky Conversion(规律)
A. Lucky Conversion time limit per test 2 seconds memory limit per test 256 megabytes input standard ...
基于深度学习的人脸识别系统（Caffe+OpenCV+Dlib）【三】VGG网络进行特征提取
前言基于深度学习的人脸识别系统,一共用到了5个开源库:OpenCV(计算机视觉库).Caffe(深度学习库).Dlib(机器学习库).libfacedetection(人脸检测库).cudnn(gp ...
EL表达式.md
操作符描述 . 访问一个Bean属性或者一个映射条目 [] 访问一个数组或者链表的元素 ( ) 组织一个子表达式以改变优先级 + 加 - 减或负 * 乘 / or div 除 % or mod 取模 ...
UVA 11280 - Flying to Fredericton SPFA变形
http://uva.onlinejudge.org/index.php?option=com_onlinejudge&Itemid=8&page=show_problem&c ...

webcollector + selenium 爬取空间相册图片

webcollector + selenium 爬取空间相册图片的更多相关文章

随机推荐

热门专题