SuperLOFTERDownloader7.java

package test;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Timer;
import java.util.TimerTask;
import java.util.concurrent.TimeUnit;

import javax.swing.JOptionPane;

import org.openqa.selenium.By;
import org.openqa.selenium.JavascriptExecutor;
import org.openqa.selenium.Keys;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.firefox.FirefoxBinary;
import org.openqa.selenium.firefox.FirefoxDriver;
import org.openqa.selenium.firefox.FirefoxOptions;
import org.openqa.selenium.interactions.Actions;

public class SuperLOFTERDownloader7 {

    // static String html;
    static int urlCount = 0;
    static String username = "fenchenyue";
    static String url = "http://" + username + ".lofter.com/view";
    static String picUrl;
    static boolean ifToBreak = false;
    static ArrayList<String> urlList = new ArrayList<String>();
    // Note: the directory path must end with "\"
    static String downloadToDir = "E:\\爬虫\\妹纸2\\";

    public static void main(String[] args) {
        System.setProperty("webdriver.gecko.driver", "C:\\Users\\Jim\\Desktop\\GeckoDriver\\geckodriver.exe");
        FirefoxOptions options = new FirefoxOptions();
        // Preference: do not load images (speeds up crawling)
        options.addPreference("permissions.default.image", 2);
        // Command-line option: run headless (no visible browser window)
        FirefoxBinary myBinary = new FirefoxBinary();
        myBinary.addCommandLineOptions("--headless");
        options.setBinary(myBinary);
        FirefoxDriver driver = new FirefoxDriver(options);
        driver.manage().timeouts().implicitlyWait(16, TimeUnit.SECONDS); // implicit wait
        driver.get(url);
        // ((JavascriptExecutor) driver).executeScript("document.querySelector('.m-txtsch').style.display=\"inline\"");
        // Read the total post count displayed on the archive page
        int givenNumber = Integer.parseInt(driver.findElementByCssSelector(
                "body > div.g-bdfull.g-bdfull-show.ztag > div.g-bdc.ztag > div.m-fbar.f-cb > div.schbtn.f-cb > div:nth-child(1) > div > div.txt > a.ztag.currt > span")
                .getAttribute("innerHTML"));
        Actions action = new Actions(driver);
        Timer timer = new Timer();
        // Press END on a timer so the page keeps lazy-loading more posts
        timer.schedule(new TimerTask() {
            @Override
            public void run() {
                action.sendKeys(/* driver.findElement(By.cssSelector("body")), */ Keys.END).perform();
            }
        }, 1, 700);
        /*
         * WebDriverWait wait = new WebDriverWait(driver, 256, 2048);
         * wait.until(ExpectedConditions.numberOfElementsToBe(
         *         By.cssSelector("div.ztag > div.m-filecnt.m-filecnt-1 > ul > li"), number));
         */
        // Let the user abort the collection phase early from a dialog
        new Thread(new Runnable() {
            @Override
            public void run() {
                int flag = JOptionPane.showConfirmDialog(null,
                        "Collecting resources...\nIf this takes too long, click 'Yes' to stop early", "Stop early?",
                        JOptionPane.YES_NO_OPTION);
                if (flag == JOptionPane.YES_OPTION) {
                    ifToBreak = true;
                }
            }
        }).start();
        // Poll until the page has loaded (almost) everything, or the user chose to stop
String js = "let count=0;document.querySelectorAll('.g-bdc > div:nth-child(3) > div.m-filecnt.m-filecnt-1 > ul').forEach(function(e,i){count+=e.children.length});return count;";
while (true) { long countFromJs = (long) ((JavascriptExecutor) driver).executeScript(js);
System.out.println("已收集到资源数 : " + countFromJs);
if (Math.abs((int) countFromJs - givenNumber) < 5 || ifToBreak) {
break;
}
try {
Thread.sleep(1600);
} catch (InterruptedException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
timer.cancel(); for (WebElement element : driver.findElements(
By.cssSelector("div.ztag > div.m-filecnt.m-filecnt-1 > ul > li > a > div > div > img.realimg"))) {
picUrl = element.getAttribute("src").split("\\?")[0];
System.out.println(picUrl);
urlList.add(picUrl);
urlCount++; } System.out.println("爬取到" + givenNumber + "篇文章中的" + urlCount + "张图的url"); int flag = JOptionPane.showConfirmDialog(null, "核对url,是否开始下载?", "是否继续", JOptionPane.YES_NO_OPTION);
        if (flag == JOptionPane.NO_OPTION) {
            System.exit(0);
        } else if (flag == JOptionPane.YES_OPTION) {
            System.out.println("Downloading.....");
            try {
                // Open the target directory in Explorer so progress is visible
                Runtime.getRuntime().exec("explorer " + downloadToDir);
            } catch (IOException e) {
                e.printStackTrace();
            }
            System.out.println("Done. Successfully downloaded "
                    + DownloadFromUrlList3.download(urlList, downloadToDir) + " images");
            System.out.println("Failures: " + DownloadFromUrlList3.errCount);
        }
        // JOptionPane.showMessageDialog(null, "Is it OK to close the browser now?");
        driver.quit();
        System.exit(0);
    }
}
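
The commented-out WebDriverWait block above hints at a cleaner alternative to the hand-rolled Timer-plus-polling loop. Below is a minimal sketch of that idea as a self-contained helper; the class and method names are made up for illustration, and it assumes the Selenium 3 WebDriverWait(driver, timeOutInSeconds, sleepInMillis) constructor. The polling loop above could then be replaced by something like waitForCount(driver, js, givenNumber, 5), though the user-triggered early break would still need separate handling.

import org.openqa.selenium.JavascriptExecutor;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.support.ui.ExpectedCondition;
import org.openqa.selenium.support.ui.WebDriverWait;

public class WaitForLazyLoad {

    // Block until the item count reported by countScript is within
    // `tolerance` of `expected`, polling every 2 s for up to 256 s.
    static void waitForCount(WebDriver driver, final String countScript, final int expected, final int tolerance) {
        new WebDriverWait(driver, 256, 2000).until(new ExpectedCondition<Boolean>() {
            @Override
            public Boolean apply(WebDriver d) {
                long loaded = (long) ((JavascriptExecutor) d).executeScript(countScript);
                return Math.abs((int) loaded - expected) <= tolerance;
            }
        });
    }
}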

DownloadFromUrlList3.java

package test;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;

public class DownloadFromUrlList3 {

    private static int count = 0;
    static ThreadGroup myThreadGroup = new ThreadGroup("myGroup");
    static picThread myThread;
    public static int errCount = 0;
    static long timeOut = 8 * 1000;

    // Hand out unique, sequential file numbers across download threads
    public static synchronized int updateCount() {
        count++;
        return count;
    }

    public static int download(ArrayList<String> urlList, String dirPath) {
        if (new File(dirPath).exists()) {
            if (!new File(dirPath).isDirectory()) {
                System.out.println("ERROR!! THE PATH GIVEN ISN'T A DIRECTORY");
                return 0;
            }
        } else {
            new File(dirPath).mkdir();
        }
        for (String urlstr : urlList) {
            String myUrlStr = urlstr;
            myThread = new picThread(myThreadGroup, new Runnable() {
                @Override
                public void run() {
                    try {
                        URL url = new URL(myUrlStr);
                        BufferedInputStream is;
                        try {
                            // Always wrap network streams in an efficient buffered stream
                            is = new BufferedInputStream(url.openStream());
                        } catch (Exception e) {
                            System.out.println("A url resource failed to open:\n" + myUrlStr);
                            errCount++;
                            return;
                        }
                        // Guess the file extension from the stream content;
                        // guessContentTypeFromStream can return null for unknown types,
                        // so fall back to ".jpg" rather than throw a NullPointerException
                        String contentType = HttpURLConnection.guessContentTypeFromStream(is);
                        String extension = contentType == null ? ".jpg" : "." + contentType.split("/")[1];
                        File file = new File(dirPath + updateCount() + extension);
                        ((picThread) Thread.currentThread()).file = file;
                        file.createNewFile();
                        OutputStream os = new FileOutputStream(file);
                        int len;
                        byte[] buffer = new byte[1024];
                        while ((len = is.read(buffer)) != -1) {
                            os.write(buffer, 0, len);
                        }
                        is.close();
                        os.close();
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
            });
            myThread.urlStr = myUrlStr;
            myThread.start();
            // threadList.add(myThread);
            try {
                // Give each download up to timeOut millis before moving on
                myThread.join(timeOut);
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
        }
        /*
         * for (Thread iThread : threadList) {
         *     try {
         *         // Wait for all threads to finish
         *         iThread.join(1000 * 15);
         *     } catch (InterruptedException e) {
         *         e.printStackTrace();
         *     }
         * }
         */
        try {
            Thread.sleep(1024);
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
        // Anything still alive in the group has exceeded its time budget
        Thread[] activeThreads = new Thread[myThreadGroup.activeCount()];
        myThreadGroup.enumerate(activeThreads);
        for (Thread thread : activeThreads) {
            System.out.println("An image download timed out: " + ((picThread) thread).file.getName() + "\n"
                    + ((picThread) thread).urlStr);
            errCount++;
            // ((picThread) thread).file.delete();
        }
        return count - myThreadGroup.activeCount();
    }

    // Usage example:
    public static void main(String[] args) {
        ArrayList<String> list = new ArrayList<String>();
        list.add(
                "http://imglf1.nosdn.127.net/img/WnhZUEZYSVNPd1lsd1doNjVPSWVkREFwNTRrNTdUM2tQYkk2bHVabHpIYjJOZVFFeEdaUWRnPT0.jpg");
        list.add("http://imgsrc.baidu.com/imgad/pic/item/b58f8c5494eef01f2c2e59feebfe9925bc317dd6.jpg");
        list.add(
                "http://imglf1.nosdn.127.net/img/WnhZUEZYSVNPd1p2Wk54NlRNMTZKLzJnci9HanJsbUNFUTlJdWdFaDhJQUlQQ3h5Y1kzOFlRPT0.jpg");
        list.add("http://b.hiphotos.baidu.com/image/pic/item/7a899e510fb30f247b237cc9c195d143ac4b03ba.jpg");
        System.out.println("Done. Successfully downloaded: " + download(list, "C:\\Users\\Jim\\Desktop\\testPic\\"));
        System.out.println("Failed: " + DownloadFromUrlList3.errCount);
        System.exit(0);
    }
}

class picThread extends Thread {

    public String urlStr = "";
    public File file;

    /*
     * public void setUrl(String url) { urlStr = url; }
     *
     * public String getUrl() { return urlStr; }
     */

    public picThread(ThreadGroup group, Runnable run) {
        super(group, run);
    }
}
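
A note on the threading here: errCount is incremented from several worker threads without synchronization, and a download that outlives join(timeOut) keeps running in the background even though it is reported as a timeout. Below is a minimal sketch of the same per-file time budget built on ExecutorService and Future instead of ThreadGroup; the pool size of 4 and the fixed ".jpg" extension are illustrative assumptions, not taken from the code above.

import java.io.InputStream;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.atomic.AtomicInteger;

public class PooledDownloader {

    // Download every url into dirPath, giving each file roughly 8 seconds,
    // and return how many files were written successfully.
    public static int download(List<String> urlList, final String dirPath) throws InterruptedException {
        ExecutorService pool = Executors.newFixedThreadPool(4);
        final AtomicInteger success = new AtomicInteger(); // safe to update from any worker
        List<Future<?>> pending = new ArrayList<Future<?>>();
        int index = 0;
        for (String urlStr : urlList) {
            final String myUrl = urlStr;
            final int myIndex = ++index;
            pending.add(pool.submit(new Runnable() {
                @Override
                public void run() {
                    try (InputStream in = new URL(myUrl).openStream()) {
                        Files.copy(in, Paths.get(dirPath, myIndex + ".jpg"),
                                StandardCopyOption.REPLACE_EXISTING);
                        success.incrementAndGet();
                    } catch (Exception e) {
                        System.out.println("Failed url: " + myUrl);
                    }
                }
            }));
        }
        pool.shutdown();
        for (Future<?> f : pending) {
            try {
                f.get(8, TimeUnit.SECONDS); // per-download timeout, like timeOut above
            } catch (TimeoutException | ExecutionException e) {
                f.cancel(true); // interrupt a stuck download instead of leaking the thread
            }
        }
        pool.shutdownNow();
        return success.get();
    }
}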
