selenium爬虫实战 (下)

SuperLOFTERDownloader7.java

package test;

import java.io.IOException;

import java.util.ArrayList;

import java.util.Timer;

import java.util.TimerTask;

import java.util.concurrent.TimeUnit;

import javax.swing.JOptionPane;

import org.openqa.selenium.By;

import org.openqa.selenium.JavascriptExecutor;

import org.openqa.selenium.Keys;

import org.openqa.selenium.WebElement;

import org.openqa.selenium.firefox.FirefoxBinary;

import org.openqa.selenium.firefox.FirefoxDriver;

import org.openqa.selenium.firefox.FirefoxOptions;

import org.openqa.selenium.interactions.Actions;

/**
 * Collects image URLs from a LOFTER user's archive page ({@code /view}) using a headless
 * Firefox driven through Selenium, then passes them to
 * {@code DownloadFromUrlList3.download} to be saved into {@link #downloadToDir}.
 *
 * <p>The page lazy-loads its content, so the program keeps pressing END to scroll while it
 * polls the number of loaded entries. Collection stops once that number is within 5 of the
 * count shown on the page, or when the user interrupts through a dialog.
 */
public class SuperLOFTERDownloader7 {

    /** Number of image URLs collected so far. */
    static int urlCount = 0;

    static String username = "fenchenyue";

    static String url = "http://" + username + ".lofter.com/view";

    /** The most recently collected image URL (query string removed). */
    static String picUrl;

    /**
     * Set to {@code true} by the dialog thread when the user asks to stop collecting early.
     * Declared volatile because it is written by that thread and polled by the main thread.
     */
    static volatile boolean ifToBreak = false;

    static ArrayList<String> urlList = new ArrayList<String>();

    // Note: the directory path must end with "\".
    static String downloadToDir = "E:\\爬虫\\妹纸2\\";

    /**
     * Runs the crawl: opens the archive page, scrolls until everything is loaded (or the user
     * interrupts), gathers the image URLs, asks for confirmation and starts the download.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        System.setProperty("webdriver.gecko.driver", "C:\\Users\\Jim\\Desktop\\GeckoDriver\\geckodriver.exe");

        FirefoxOptions options = new FirefoxOptions();
        // Start-up preference: do not load images.
        options.addPreference("permissions.default.image", 2);
        // Start-up argument: run without a window.
        FirefoxBinary myBinary = new FirefoxBinary();
        myBinary.addCommandLineOptions("--headless");
        options.setBinary(myBinary);

        FirefoxDriver driver = new FirefoxDriver(options);
        Timer timer = new Timer();
        try {
            driver.manage().timeouts().implicitlyWait(16, TimeUnit.SECONDS); // implicit wait
            driver.get(url);

            // The entry count displayed on the page; trimmed so surrounding whitespace in the
            // markup does not break the parse.
            int givenNumber = Integer.parseInt(driver.findElementByCssSelector(
                    "body > div.g-bdfull.g-bdfull-show.ztag > div.g-bdc.ztag > div.m-fbar.f-cb > div.schbtn.f-cb > div:nth-child(1) > div > div.txt > a.ztag.currt > span")
                    .getAttribute("innerHTML").trim());

            Actions action = new Actions(driver);
            // Press END every 700 ms so the page keeps loading more entries.
            // NOTE(review): this drives the WebDriver from the timer thread while the main
            // thread polls it below; confirm the driver tolerates concurrent use.
            timer.schedule(new TimerTask() {
                @Override
                public void run() {
                    action.sendKeys(Keys.END).perform();
                }
            }, 1, 700);

            // The confirm dialog blocks, so it runs on its own thread and only raises a flag.
            new Thread(new Runnable() {
                @Override
                public void run() {
                    int flag = JOptionPane.showConfirmDialog(null, "正在收集资源...\n如果觉得时间过长,点击'是'提前中断", "是否中断",
                            JOptionPane.YES_NO_OPTION);
                    if (flag == JOptionPane.YES_OPTION) {
                        ifToBreak = true;
                    }
                }
            }).start();

            // Poll until the document is (nearly) fully loaded or the user chose to interrupt.
            String js = "let count=0;document.querySelectorAll('.g-bdc > div:nth-child(3) > div.m-filecnt.m-filecnt-1 > ul').forEach(function(e,i){count+=e.children.length});return count;";
            while (true) {
                // Convert through Number instead of casting the Object result straight to long.
                long countFromJs = ((Number) ((JavascriptExecutor) driver).executeScript(js)).longValue();
                System.out.println("已收集到资源数 : " + countFromJs);
                if (Math.abs(countFromJs - givenNumber) < 5 || ifToBreak) {
                    break;
                }
                try {
                    Thread.sleep(1600);
                } catch (InterruptedException e) {
                    // Keep the interrupt status and stop waiting; otherwise every following
                    // sleep would throw again immediately.
                    Thread.currentThread().interrupt();
                    break;
                }
            }
            timer.cancel();

            for (WebElement element : driver.findElements(
                    By.cssSelector("div.ztag > div.m-filecnt.m-filecnt-1 > ul > li > a > div > div > img.realimg"))) {
                String src = element.getAttribute("src");
                if (src == null) {
                    // An image without a src attribute has nothing to download.
                    continue;
                }
                // Drop the query string to get the plain image URL.
                picUrl = src.split("\\?")[0];
                System.out.println(picUrl);
                urlList.add(picUrl);
                urlCount++;
            }
            System.out.println("爬取到" + givenNumber + "篇文章中的" + urlCount + "张图的url");

            int flag = JOptionPane.showConfirmDialog(null, "核对url,是否开始下载?", "是否继续", JOptionPane.YES_NO_OPTION);
            // On "No" (or a closed dialog) simply fall through: the finally block must run so
            // the browser is shut down before the JVM exits.
            if (flag == JOptionPane.YES_OPTION) {
                System.out.println("正在下载.....");
                try {
                    // Argument list instead of a concatenated command string, so a path
                    // containing spaces is passed to explorer as a single argument.
                    new ProcessBuilder("explorer", downloadToDir).start();
                } catch (IOException e) {
                    e.printStackTrace();
                }
                System.out.println("完成,成功下载了" + DownloadFromUrlList3.download(urlList, downloadToDir) + "张图");
                System.out.println("失败数:" + DownloadFromUrlList3.errCount);
            }
        } finally {
            // Always stop the scroll timer and close the browser, even after an exception.
            timer.cancel();
            driver.quit();
        }
        // Explicit exit: the dialog thread may still be alive and would keep the JVM running.
        System.exit(0);
    }
}

DownloadFromUrlList3.java

package test;

import java.io.BufferedInputStream;

import java.io.File;

import java.io.FileOutputStream;

import java.io.IOException;

import java.io.OutputStream;

import java.net.HttpURLConnection;

import java.net.URL;

import java.util.ArrayList;

/**
 * Downloads a list of image URLs into a directory, one worker thread per URL.
 *
 * <p>Each worker is started and then joined with a timeout of {@link #timeOut} milliseconds
 * before the next one starts, so a stalled download delays the rest by at most that long.
 * Files are named with a running counter plus an extension derived from the stream's
 * detected content type.
 */
public class DownloadFromUrlList3 {

    /** Number of target files allocated so far; also the base name of the latest file. */
    static private int count = 0;

    /** Group holding all download workers, used to find those still running at the end. */
    static ThreadGroup myThreadGroup = new ThreadGroup("myGroup");

    static picThread myThread;

    /** Number of URLs that could not be opened or whose download timed out. */
    public static int errCount = 0;

    /** Maximum time in milliseconds to wait for one download before moving on. */
    static long timeOut = 8 * 1000;

    /**
     * Increments the file counter.
     *
     * @return the new counter value
     */
    public static synchronized int updateCount() {
        count++;
        return count;
    }

    /** Reads the file counter under the same lock that {@link #updateCount()} uses. */
    private static synchronized int currentCount() {
        return count;
    }

    /** Increments {@link #errCount}; synchronized because workers and the caller both report errors. */
    private static synchronized void recordError() {
        errCount++;
    }

    /**
     * Derives a file extension (including the leading dot) for a download.
     * Uses the content type sniffed from the stream; when that is unavailable, falls back to
     * the suffix of the URL path, and finally to an empty extension.
     */
    private static String guessExtension(BufferedInputStream in, String urlStr) throws IOException {
        String contentType = HttpURLConnection.guessContentTypeFromStream(in);
        if (contentType != null) {
            int slash = contentType.indexOf('/');
            if (slash >= 0 && slash < contentType.length() - 1) {
                return "." + contentType.substring(slash + 1);
            }
        }
        int query = urlStr.indexOf('?');
        String path = query >= 0 ? urlStr.substring(0, query) : urlStr;
        int lastSlash = path.lastIndexOf('/');
        int lastDot = path.lastIndexOf('.');
        if (lastDot > lastSlash && lastDot < path.length() - 1) {
            return path.substring(lastDot);
        }
        return "";
    }

    /**
     * Downloads every URL in {@code urlList} into {@code dirPath}.
     *
     * @param urlList URLs to download
     * @param dirPath target directory; it is concatenated directly with the file name, so it
     *                must end with a path separator. Created if it does not exist.
     * @return the file counter minus the downloads that had started writing a file but were
     *         still running when this method finished; {@code 0} if {@code dirPath} is not
     *         a usable directory. The counter is static, so it accumulates across calls.
     */
    public static int download(ArrayList<String> urlList, String dirPath) {
        File dir = new File(dirPath);
        if (dir.exists()) {
            if (!dir.isDirectory()) {
                System.out.println("ERROR!! THE PATH GIVEN ISN'T A DIRECTORY");
                return 0;
            }
        } else if (!dir.mkdirs() && !dir.isDirectory()) {
            // mkdirs also creates missing parent directories; give up if even that fails.
            System.out.println("ERROR!! COULD NOT CREATE DIRECTORY: " + dirPath);
            return 0;
        }

        for (String urlstr : urlList) {
            String myUrlStr = urlstr;
            myThread = new picThread(myThreadGroup, new Runnable() {
                @Override
                public void run() {
                    BufferedInputStream opened;
                    try {
                        // Network input should always go through a buffered stream.
                        opened = new BufferedInputStream(new URL(myUrlStr).openStream());
                    } catch (Exception e) {
                        System.out.println("发现一个url资源出错:\n" + myUrlStr);
                        recordError();
                        return;
                    }
                    // try-with-resources closes both streams even when the copy fails midway.
                    try (BufferedInputStream is = opened) {
                        String extention = guessExtension(is, myUrlStr);
                        File file = new File(dirPath + updateCount() + extention);
                        // Publish the target file so a timeout can be reported by name.
                        ((picThread) Thread.currentThread()).file = file;
                        try (OutputStream os = new FileOutputStream(file)) {
                            int len;
                            byte[] buffer = new byte[1024];
                            while ((len = is.read(buffer)) != -1) {
                                os.write(buffer, 0, len);
                            }
                        }
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
            });
            myThread.urlStr = myUrlStr;
            myThread.start();
            try {
                // Wait for this download, but never longer than the timeout.
                myThread.join(timeOut);
            } catch (InterruptedException e) {
                // Preserve the interrupt status and stop starting further downloads.
                Thread.currentThread().interrupt();
                break;
            }
        }

        try {
            // Short grace period for downloads that are just about to finish.
            Thread.sleep(1024);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }

        // Whatever is still alive in the group is treated as timed out.
        Thread[] activeThreads = new Thread[myThreadGroup.activeCount()];
        int enumerated = myThreadGroup.enumerate(activeThreads);
        int unfinishedFiles = 0;
        for (int i = 0; i < enumerated; i++) {
            if (!(activeThreads[i] instanceof picThread)) {
                continue;
            }
            picThread stalled = (picThread) activeThreads[i];
            // The worker may still be blocked opening the URL, in which case no file exists yet.
            File partial = stalled.file;
            String fileName = partial != null ? partial.getName() : "(no file yet)";
            System.out.println("一张图片下载超时 :" + fileName + "\n" + stalled.urlStr);
            recordError();
            if (partial != null) {
                // Only workers that already took a counter value are included in the counter.
                unfinishedFiles++;
            }
        }
        return currentCount() - unfinishedFiles;
    }

    /**
     * Usage example: downloads a few sample images into a local test directory.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        ArrayList<String> list = new ArrayList<String>();
        list.add(
                "http://imglf1.nosdn.127.net/img/WnhZUEZYSVNPd1lsd1doNjVPSWVkREFwNTRrNTdUM2tQYkk2bHVabHpIYjJOZVFFeEdaUWRnPT0.jpg");
        list.add("http://imgsrc.baidu.com/imgad/pic/item/b58f8c5494eef01f2c2e59feebfe9925bc317dd6.jpg");
        list.add(
                "http://imglf1.nosdn.127.net/img/WnhZUEZYSVNPd1p2Wk54NlRNMTZKLzJnci9HanJsbUNFUTlJdWdFaDhJQUlQQ3h5Y1kzOFlRPT0.jpg");
        list.add("http://b.hiphotos.baidu.com/image/pic/item/7a899e510fb30f247b237cc9c195d143ac4b03ba.jpg");
        System.out.println("完成,成功下载了:" + download(list, "C:\\Users\\Jim\\Desktop\\testPic\\"));
        System.out.println("失败:" + DownloadFromUrlList3.errCount);
        System.exit(0);
    }
}

/**
 * Worker thread for a single image download. Besides running the given task it carries the
 * source URL and the target file, so a download that is still running can be reported.
 */
class picThread extends Thread {

    /** URL handled by this thread. */
    public String urlStr = "";

    /**
     * File this thread writes to; {@code null} until one has been assigned.
     * Volatile because it is assigned from inside the running worker and read from the thread
     * that reports timed-out downloads.
     */
    public volatile File file;

    /**
     * Creates a worker in the given thread group.
     *
     * @param group thread group the worker belongs to
     * @param run   task executed when the thread starts
     */
    public picThread(ThreadGroup group, Runnable run) {
        super(group, run);
    }
}

selenium爬虫实战 (下)的更多相关文章

selenuim爬虫实战(日lofter.com)
LOFTER是网易公司2011年8月下旬推出的一款轻博客产品. LOFTER专注于为用户提供简约.易用.有品质.重原创的博客工具.原创社区,以及有品质的手机博客应用. LOFTER首次采用独立域名,口 ...
Python简单网络爬虫实战—下载论文名称，作者信息（下）
在Python简单网络爬虫实战—下载论文名称,作者信息(上)中,学会了get到网页内容以及在谷歌浏览器找到了需要提取的内容的数据结构,接下来记录我是如何找到所有author和title的 1.从sou ...
【图文详解】python爬虫实战——5分钟做个图片自动下载器
python爬虫实战——图片自动下载器之前介绍了那么多基本知识[Python爬虫]入门知识,(没看的先去看!!)大家也估计手痒了.想要实际做个小东西来看看,毕竟: talk is cheap sho ...
Python爬虫实战（4）：豆瓣小组话题数据采集—动态网页
1, 引言注释:上一篇<Python爬虫实战(3):安居客房产经纪人信息采集>,访问的网页是静态网页,有朋友模仿那个实战来采集动态加载豆瓣小组的网页,结果不成功.本篇是针对动态网页的数据 ...
爬虫实战：爬虫之 web 自动化终极杀手 ( 上）
欢迎大家前往腾讯云技术社区,获取更多腾讯海量技术实践干货哦~ 作者:陈象导语: 最近写了好几个简单的爬虫,踩了好几个深坑,在这里总结一下,给大家在编写爬虫时候能给点思路.本次爬虫内容有:静态页面的爬 ...
Pyhton爬虫实战 - 抓取BOSS直聘职位描述和数据清洗
Pyhton爬虫实战 - 抓取BOSS直聘职位描述和数据清洗零.致谢感谢BOSS直聘相对权威的招聘信息,使本人有了这次比较有意思的研究之旅. 由于爬虫持续爬取 www.zhipin.com 网 ...
自己动手，丰衣足食！Python3网络爬虫实战案例
本教程是崔大大的爬虫实战教程的笔记:网易云课堂 Python3+Pip环境配置 Windows下安装Python: http://www.cnblogs.com/0bug/p/8228378.html ...
Python爬虫实战四之抓取淘宝MM照片
原文:Python爬虫实战四之抓取淘宝MM照片其实还有好多,大家可以看 Python爬虫学习系列教程福利啊福利,本次为大家带来的项目是抓取淘宝MM照片并保存起来,大家有没有很激动呢? 本篇目标 1. ...
第三百三十节，web爬虫讲解2—urllib库爬虫—实战爬取搜狗微信公众号—抓包软件安装Fiddler4讲解
第三百三十节,web爬虫讲解2—urllib库爬虫—实战爬取搜狗微信公众号—抓包软件安装Fiddler4讲解封装模块 #!/usr/bin/env python # -*- coding: utf- ...

随机推荐

在 WF 4 中编写自定义控制流活动
在 WF 4 中编写自定义控制流活动 Leon Welicki 控制流是指组织和执行程序中各个指令的方法. 在 Windows Workflow Foundation 4 (WF 4) 中,控制流活动 ...
[RSpec] LEVEL 1: INTRODUCTION
Install RSpec: Describe Lets start writing a specification for the Tweet class. Write a describe blo ...
阅读jQuery源代码带给我们的18个惊喜
相信大家都非常熟悉jQuery类库,绝对最受欢迎的JS框架,如果你也有兴趣阅读v源代码的话,或者你也会有同感. 以下便是阅读jQuery源代码后挖掘的18条令人惊奇的信息: 原文:阅读jQuery源代 ...
KineticJS教程（8）
KineticJS教程(8) 作者: ysm 8.动画动画就是一帧帧的画面按照时间间隔显示出来,Kinetic给我们提供了一个舞台对象的onFrame方法,用这个方法可以绑定一个动画方法,我们要显 ...
深入理解Git (三) －微命令上篇
1 git hash-object 曾经讲过Git用Hash值作为Git对象的名字,那么详细是哪个命令呢? 我们能够先改动一个文件: echo "hongchangfirst" & ...
Window上python开发--4.Django的用户登录模块User
Android系统开发交流群:484966421 OSHome. 微信公众号:oshome2015 在搭建站点和web的应用程序时,用户的登录和管理是差点儿是每一个站点都必备的. 今天主要从一个实例了 ...
JDBC-DAO经典模式实现对数据库的增、删、改、查
JDBC(Java Data Base Connection)的作用是连接数据库先看下jdbc连接SQLServer数据库的简单例子代码实现(FirstJDBC): package com.jdb ...
javascript 作用域通俗解释
首先将作用域比喻为一座大楼: 第一层表示当前执行作用域.大楼顶层表示全局作用域. (1)js首先会在当前楼层进行查找变量,如果没有找到,就做电梯往上一层(二层)楼查找. (2)若还是没有找到继续往上查 ...
14-spring学习-变量操作
表达式所有操作都是可以以变量形式出现的. 观察变量的定义: package com.Spring.ELDemo; import org.springframework.expression.Evalu ...
Android酷炫加载进度动画
概述本自定义动画进度酷炫View,是加载进度动画的自定义View,继承于ImageView来实现,主要实现蒙层加载进度的加载进度效果. 支持水平左右加载和垂直上下加载四个方向,同时也支持自定义蒙层进 ...

selenium爬虫实战 (下)

selenium爬虫实战 (下)的更多相关文章

随机推荐

热门专题