selenium 爬取空间说说

 package cn.hb.util;

 import java.io.File;

 import java.io.FileWriter;

 import java.io.IOException;

 import java.util.Set;

 import java.util.concurrent.TimeUnit;

 import org.openqa.selenium.By;

 import org.openqa.selenium.Cookie;

 import org.openqa.selenium.JavascriptExecutor;

 import org.openqa.selenium.Keys;

 import org.openqa.selenium.WebDriver;

 import org.openqa.selenium.WebElement;

 import org.openqa.selenium.firefox.FirefoxDriver;

 import org.openqa.selenium.firefox.FirefoxOptions;

 import org.openqa.selenium.interactions.Actions;

 /**

  * 爬取说说写入到txt中,爬取100条

  *

  * @author tele

  *

  */

 public class QZTwitterCrawler {

     static String url = "https://user.qzone.qq.com/1350560858";

     static int maxSize = ;

     static int pageSize = ;

     static String userName="qq";

     static String pwd = "密码";

     public static void main(String[] args) throws InterruptedException, IOException {

         login();

     }

     /**

      * 登录

      *

      * @throws InterruptedException

      * @throws IOException

      */

     public static void login() throws InterruptedException, IOException {

         System.setProperty("webdriver.gecko.driver", "D:/browserdriver/geckodriver.exe");

         FirefoxOptions options = new FirefoxOptions();

         options.setBinary("F:/ff/firefox.exe");

         WebDriver driver = new FirefoxDriver(options);

         driver.manage().window().maximize();

         // 超时

         try {

             driver.manage().timeouts().pageLoadTimeout(, TimeUnit.SECONDS);

             driver.manage().timeouts().setScriptTimeout(, TimeUnit.SECONDS);

             driver.get(url);

         } catch (Exception e) {

             System.out.println("所需元素已出现,停止加载页面");

         } finally {

             // 切换到登录login

             driver.switchTo().frame("login_frame");

             WebElement switcher_plogin = driver.findElement(By.id("switcher_plogin"));

             System.out.println(switcher_plogin.getText());

             if (switcher_plogin.isDisplayed()) {

                 switcher_plogin.click();

             }

             // 用户名

             driver.findElement(By.id("u")).clear();

             driver.findElement(By.id("u")).sendKeys(userName);

             // 密码

             driver.findElement(By.id("p")).clear();

             driver.findElement(By.id("p")).sendKeys(pwd);

             // 登录

             try {

                 driver.findElement(By.id("login_button")).click();

                 Thread.sleep();

             } catch (Exception e) {

                 e.printStackTrace();

             } finally {

                 if ("https://i.qq.com/".equals(driver.getCurrentUrl())) {

                     System.out.println("登录失败!5秒后再次尝试登录");

                     Thread.sleep();

                     driver.findElement(By.id("login_button")).click();

                 }

             }

             // 退出frame

             driver.switchTo().defaultContent();

             System.out.println(driver.getCurrentUrl());

             JavascriptExecutor jsExecutor = (JavascriptExecutor) driver;

             // 如果有亲密度提示

               try { WebElement fs_guide = driver.findElement(By.xpath(

               "//div[@id='friendship_promote_layer']/table[@class='tbl-fs-guide']//a"

              )); if(fs_guide != null && fs_guide.isDisplayed()) {

               fs_guide.click(); } } catch (Exception e) { e.printStackTrace();

               }finally {

               }

             // 点击说说

             driver.findElement(By.cssSelector("#menuContainer ul.head-nav-menu>li.menu_item_311>a")).click();

             Thread.sleep();

             // 切换到frame

             driver.switchTo().frame(driver.findElement(By.className("app_canvas_frame")));

             Thread.sleep();

             // 拼接cookie

         /*    StringBuilder builder = new StringBuilder();

             Set<Cookie> cookieSet = driver.manage().getCookies();

             cookieSet.forEach(c -> builder.append(c.getName()).append("=").append(c.getValue()).append("; "));

             cookies = builder.toString();*/

             // 保存

             saveTwitter(driver);

             System.out.println("内容提取完毕,退出浏览器");

             driver.quit();

         }

     }

     /**

      * 序列化

      * @param driver

      * @return

      * @throws InterruptedException

      * @throws IOException

      */

     public static void saveTwitter(WebDriver driver) throws InterruptedException, IOException {

         File file = new File("f:/qz/twitter.txt");

         // 文件夹检测

         if (!file.getParentFile().exists()) {

             file.mkdirs();

         } else {

             file.delete();

         }

         FileWriter fileWriter = new FileWriter(file, true);

         String xpath;

         // 模拟按键进行滚动

         Actions actions = new Actions(driver);

         //说说总量

         String totalNumStr = driver.findElement(By.xpath("//div[@class='feed_num']/a")).getText();

         int totalNum = Integer.parseInt(totalNumStr);

         // 计算页数

         int totalPage = (int) Math.ceil((double)Math.min(maxSize, totalNum) / (double) pageSize);

         // 构造xpath

         for (int i = ; i < totalPage; i++) {

             for (int j = ; j < pageSize; j++) {

                 xpath = "//ol[@id='msgList']/li[" + (j + ) + "]/div[3]/div[2]/pre[@class='content']";

                 // 获取说说内容

                 try {

                     WebElement element = driver.findElement(By.xpath(xpath));

                     String text = element.getText();

                     System.out.println("本页第" + (j + ) + "条   :" + text);

                     fileWriter.write(text, , text.length());

                 } catch (Exception e) {

                     e.printStackTrace();

                 } finally {

                 }

                 if (j %  == ) {

                     actions.sendKeys(Keys.ARROW_DOWN).perform();

                 }

             }

             System.out.println("第" + (i + ) + "页说说爬取完毕");

             // 分页

             if ((i + ) <= totalPage) {

                 driver.findElement(By.xpath("//a[@id='pager_num_" + i + "_" + (i + ) + "']")).click();

                 // 等待页面加载

                 Thread.sleep();

             }

         }

         if (fileWriter != null) {

             fileWriter.close();

         }

     }

 }

比爬取相册简单一些,唯一有点绕的地方是翻页链接 id 的构造(pager_num_当前页下标_目标页码)。这份代码只提取说说的文字内容,可以用来生成词云。

selenium 爬取空间说说的更多相关文章

爬虫学习06用selenium爬取空间
用selenium爬取空间 from selenium import webdriver from lxml import etree import time pro = webdriver.Chro ...
webcollector + selenium 爬取空间相册图片
package cn.hb.util; import java.io.File; import java.io.FileNotFoundException; import java.io.FileWr ...
[Python爬虫] Selenium爬取新浪微博客户端用户信息、热点话题及评论 (上)
转载自:http://blog.csdn.net/eastmount/article/details/51231852 一. 文章介绍源码下载地址:http://download.csdn.net/ ...
selenium爬取煎蛋网
selenium爬取煎蛋网直接上代码 from selenium import webdriver from selenium.webdriver.support.ui import WebDriv ...
利用selenium爬取京东商品信息存放到mongodb
利用selenium爬取京东商城的商品信息思路: 1.首先进入京东的搜索页面,分析搜索页面信息可以得到路由结构 2.根据页面信息可以看到京东在搜索页面使用了懒加载,所以为了解决这个问题,使用递归.等待 ...
利用Selenium爬取淘宝商品信息
一. Selenium和PhantomJS介绍 Selenium是一个用于Web应用程序测试的工具,Selenium直接运行在浏览器中,就像真正的用户在操作一样.由于这个性质,Selenium也是一 ...
Scrapy 框架使用 selenium 爬取动态加载内容
使用 selenium 爬取动态加载内容开启中间件 DOWNLOADER_MIDDLEWARES = { 'wangyiPro.middlewares.WangyiproDownloaderMidd ...
使用selenium爬取网站动态数据
处理页面动态加载的爬取 selenium selenium是python的一个第三方库,可以实现让浏览器完成自动化的操作,比如说点击按钮拖动滚轮等环境搭建: 安装:pip install selen ...
scrapy框架 + selenium 爬取豆瓣电影top250......
废话不说,直接上代码..... 目录结构 items.py import scrapy class DoubanCrawlerItem(scrapy.Item): # 电影名称 movieName = ...

随机推荐

2.Docker初体验【Docker每天5分钟】
原文:2.Docker初体验[Docker每天5分钟] Docker给PaaS世界带来的“降维打击”,其实是提供了一种非常便利的打包机制.该机制打包了应用运行所需要的整个操作系统,从而保证了本地环境和 ...
【Codeforces Round #445 (Div. 2) D】Restoration of string
[链接] 我是链接,点我呀:) [题意] 给你n个字符串. 让你构造一个字符串s. 使得这n个字符串. 每个字符串都是s的子串. 且都是出现次数最多的子串. 要求s的长度最短,且s的字典序最小. [题 ...
UVA 11624 - Fire! 图BFS
看题传送门昨天晚上UVA上不去今天晚上才上得去,这是在维护么? 然后去看了JAVA,感觉还不错昂~ 晚上上去UVA后经常连接失败作死啊. 第一次做图的题~ 基本是照着抄的T T 不过搞懂了图的BFS ...
Linux中U盘和SD卡加载卸载命令
U盘挂载命令U盘插入的时候会显示启动信息,启动信息中sda: sda1指U盘的设备名为sda1dev设备目录下有一个sda1设备文件,此设备文件就是我们插入的U盘,我们将这个设备文件挂载到Linux系 ...
与Eclipse关于"Call Hierarchy"和"Find Reference"功能比较
"Call Hierarchy"功能比较 Eclipse的"Call Hierarchy"可以查看一个Java方法或类成员变量的调用树(caller和calle ...
Perl自动释放Licence启动Verdi
Perl自动释放Licence启动Verdi 在工作中,遇到verdi的License不够的情况,某些人占用了多个License,为及时获得一个可用的License,写了一个perl来kill运行时间 ...
C++设计模式实现--备忘录(Memento)模式
一. 备忘录模式定义:在不破坏封装性的前提下,捕获一个对象的内部状态.并在该对象之外保存这个状态. 这样以后就可将该对象恢复到原先保存的状态. 结构图: 使用范围: Memento 模式比較适用于功 ...
[Docker] Run, Stop and Remove Docker Containers
In this lesson, we'll find out the basics of running Docker containers. We'll go over how to downloa ...
xv6进程切换-swtch函数
https://blog.csdn.net/Swartz2015/article/details/61615603 xv6进程切换-swtch函数进程切换中由于需要保存当前进程的寄存器状态信息,又要 ...
[Angular] Auxiliary named router outlets
Define a auxilliary router: export const ROUTES: Routes = [ { path: 'folder/:name', component: MailF ...

selenium 爬取空间说说

selenium 爬取空间说说的更多相关文章

随机推荐

热门专题