selenuim爬虫实战 (下)
SuperLOFTERDownloader7.java
package test;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Timer;
import java.util.TimerTask;
import java.util.concurrent.TimeUnit;
import javax.swing.JOptionPane;
import org.openqa.selenium.By;
import org.openqa.selenium.JavascriptExecutor;
import org.openqa.selenium.Keys;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.firefox.FirefoxBinary;
import org.openqa.selenium.firefox.FirefoxDriver;
import org.openqa.selenium.firefox.FirefoxOptions;
import org.openqa.selenium.interactions.Actions;
public class SuperLOFTERDownloader7 {
// static String html;
static int urlCount = 0;
static String username = "fenchenyue";
static String url = "http://" + username + ".lofter.com/view";
static String picUrl;
static boolean ifToBreak = false;
static ArrayList<String> urlList = new ArrayList<String>();
// 注意:目录路径必须以"\"结尾
static String downloadToDir = "E:\\爬虫\\妹纸2\\";
public static void main(String[] args) {
System.setProperty("webdriver.gecko.driver", "C:\\Users\\Jim\\Desktop\\GeckoDriver\\geckodriver.exe");
FirefoxOptions options = new FirefoxOptions();
// 启动配置"不加载图片"
options.addPreference("permissions.default.image", 2);
// 启动参数"无界面"
FirefoxBinary myBinary = new FirefoxBinary();
myBinary.addCommandLineOptions("--headless");
options.setBinary(myBinary);
FirefoxDriver driver = new FirefoxDriver(options);
driver.manage().timeouts().implicitlyWait(16, TimeUnit.SECONDS); // 隐式等待
driver.get(url);
// ((JavascriptExecutor)
// driver).executeScript("document.querySelector('.m-txtsch').style.display=\"inline\"");
int givenNumber = Integer.parseInt(driver.findElementByCssSelector(
"body > div.g-bdfull.g-bdfull-show.ztag > div.g-bdc.ztag > div.m-fbar.f-cb > div.schbtn.f-cb > div:nth-child(1) > div > div.txt > a.ztag.currt > span")
.getAttribute("innerHTML"));
Actions action = new Actions(driver);
Timer timer = new Timer();
timer.schedule(new TimerTask() {
@Override
public void run() {
// TODO Auto-generated method stub
action.sendKeys(/* driver.findElement(By.cssSelector("body")), */ Keys.END).perform();
}
}, 1, 700);
/*
* WebDriverWait wait = new WebDriverWait(driver, 256, 2048);
* wait.until(ExpectedConditions .numberOfElementsToBe(By.
* cssSelector("div.ztag > div.m-filecnt.m-filecnt-1 > ul > li"), number));
*/
new Thread(new Runnable() {
@Override
public void run() {
// TODO Auto-generated method stub
int flag = JOptionPane.showConfirmDialog(null, "正在收集资源...\n如果觉得时间过长,点击'是'提前中断", "是否中断",
JOptionPane.YES_NO_OPTION);
if (flag == JOptionPane.YES_OPTION) {
ifToBreak = true;
}
}
}).start();
// 周期查询文档是否加载完或者用户选择中断
String js = "let count=0;document.querySelectorAll('.g-bdc > div:nth-child(3) > div.m-filecnt.m-filecnt-1 > ul').forEach(function(e,i){count+=e.children.length});return count;";
while (true) {
long countFromJs = (long) ((JavascriptExecutor) driver).executeScript(js);
System.out.println("已收集到资源数 : " + countFromJs);
if (Math.abs((int) countFromJs - givenNumber) < 5 || ifToBreak) {
break;
}
try {
Thread.sleep(1600);
} catch (InterruptedException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
timer.cancel();
for (WebElement element : driver.findElements(
By.cssSelector("div.ztag > div.m-filecnt.m-filecnt-1 > ul > li > a > div > div > img.realimg"))) {
picUrl = element.getAttribute("src").split("\\?")[0];
System.out.println(picUrl);
urlList.add(picUrl);
urlCount++;
}
System.out.println("爬取到" + givenNumber + "篇文章中的" + urlCount + "张图的url");
int flag = JOptionPane.showConfirmDialog(null, "核对url,是否开始下载?", "是否继续", JOptionPane.YES_NO_OPTION);
if (flag == JOptionPane.NO_OPTION) {
System.exit(0);
} else if (flag == JOptionPane.YES_OPTION) {
System.out.println("正在下载.....");
try {
Runtime.getRuntime().exec("explorer " + downloadToDir);
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
System.out.println("完成,成功下载了" + DownloadFromUrlList3.download(urlList, downloadToDir) + "张图");
System.out.println("失败数:" + DownloadFromUrlList3.errCount);
}
// JOptionPane.showMessageDialog(null, "浏览器可以关闭了吗?");
driver.quit();
System.exit(0);
}
}
DownloadFromUrlList3.java
package test;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
public class DownloadFromUrlList3 {
static private int count = 0;
static ThreadGroup myThreadGroup = new ThreadGroup("myGroup");
static picThread myThread;
public static int errCount = 0;
static long timeOut = 8 * 1000;
public static synchronized int updateCount() {
count++;
return count;
};
public static int download(ArrayList<String> urlList, String dirPath) {
if (new File(dirPath).exists()) {
if (!new File(dirPath).isDirectory()) {
System.out.println("ERROR!! THE PATH GIVEN ISN'T A DIRECTORY");
return 0;
}
} else {
new File(dirPath).mkdir();
}
for (String urlstr : urlList) {
String myUrlStr = urlstr;
myThread = new picThread(myThreadGroup, new Runnable() {
@Override
public void run() {
// TODO Auto-generated method stub
try {
URL url = new URL(myUrlStr);
BufferedInputStream is;
try {
// 网络流量一定要用高效的buffered
is = new BufferedInputStream(url.openStream());
} catch (Exception e) {
// TODO Auto-generated catch block
System.out.println("发现一个url资源出错:\n" + myUrlStr);
errCount++;
return;
}
String extention = "." + HttpURLConnection.guessContentTypeFromStream(is).split("/")[1];
File file = new File(dirPath + updateCount() + extention);
((picThread) Thread.currentThread()).file = file;
file.createNewFile();
OutputStream os = new FileOutputStream(file);
int len;
byte[] buffer = new byte[1024];
while ((len = is.read(buffer)) != -1) {
os.write(buffer, 0, len);
}
is.close();
os.close();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
});
myThread.urlStr = myUrlStr;
myThread.start();
// threadList.add(myThread);
try {
myThread.join(timeOut);
} catch (InterruptedException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
/*
* for (Thread iThread : threadList) { try { // 等待所有线程执行完毕 iThread.join(1000 *
* 15); } catch (InterruptedException e) { e.printStackTrace(); } }
*/
try {
Thread.sleep(1024);
} catch (InterruptedException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
Thread[] activeThreads = new Thread[myThreadGroup.activeCount()];
myThreadGroup.enumerate(activeThreads);
for (Thread thread : activeThreads) {
System.out.println("一张图片下载超时 :" + ((picThread) thread).file.getName() + "\n" + ((picThread) thread).urlStr);
errCount++;
// ((picThread) thread).file.delete();
}
return count - myThreadGroup.activeCount();
}
// 使用示例:
public static void main(String[] args) {
ArrayList<String> list = new ArrayList<String>();
list.add(
"http://imglf1.nosdn.127.net/img/WnhZUEZYSVNPd1lsd1doNjVPSWVkREFwNTRrNTdUM2tQYkk2bHVabHpIYjJOZVFFeEdaUWRnPT0.jpg");
list.add("http://imgsrc.baidu.com/imgad/pic/item/b58f8c5494eef01f2c2e59feebfe9925bc317dd6.jpg");
list.add(
"http://imglf1.nosdn.127.net/img/WnhZUEZYSVNPd1p2Wk54NlRNMTZKLzJnci9HanJsbUNFUTlJdWdFaDhJQUlQQ3h5Y1kzOFlRPT0.jpg");
list.add("http://b.hiphotos.baidu.com/image/pic/item/7a899e510fb30f247b237cc9c195d143ac4b03ba.jpg");
System.out.println("完成,成功下载了:" + download(list, "C:\\Users\\Jim\\Desktop\\testPic\\"));
System.out.println("失败:" + DownloadFromUrlList3.errCount);
System.exit(0);
}
}
class picThread extends Thread {
public String urlStr = "";
public File file;
/*
* public void setUrl(String url) { urlStr = url; }
*
* public String getUrl() { return urlStr; }
*/
public picThread(ThreadGroup group, Runnable run) {
super(group, run);
}
}
selenuim爬虫实战 (下)的更多相关文章
- selenuim爬虫实战(日lofter.com)
LOFTER是网易公司2011年8月下旬推出的一款轻博客产品. LOFTER专注于为用户提供简约.易用.有品质.重原创的博客工具.原创社区,以及有品质的手机博客应用. LOFTER首次采用独立域名,口 ...
- Python简单网络爬虫实战—下载论文名称,作者信息(下)
在Python简单网络爬虫实战—下载论文名称,作者信息(上)中,学会了get到网页内容以及在谷歌浏览器找到了需要提取的内容的数据结构,接下来记录我是如何找到所有author和title的 1.从sou ...
- 【图文详解】python爬虫实战——5分钟做个图片自动下载器
python爬虫实战——图片自动下载器 之前介绍了那么多基本知识[Python爬虫]入门知识,(没看的先去看!!)大家也估计手痒了.想要实际做个小东西来看看,毕竟: talk is cheap sho ...
- Python爬虫实战(4):豆瓣小组话题数据采集—动态网页
1, 引言 注释:上一篇<Python爬虫实战(3):安居客房产经纪人信息采集>,访问的网页是静态网页,有朋友模仿那个实战来采集动态加载豆瓣小组的网页,结果不成功.本篇是针对动态网页的数据 ...
- 爬虫实战:爬虫之 web 自动化终极杀手 ( 上)
欢迎大家前往腾讯云技术社区,获取更多腾讯海量技术实践干货哦~ 作者:陈象 导语: 最近写了好几个简单的爬虫,踩了好几个深坑,在这里总结一下,给大家在编写爬虫时候能给点思路.本次爬虫内容有:静态页面的爬 ...
- Pyhton爬虫实战 - 抓取BOSS直聘职位描述 和 数据清洗
Pyhton爬虫实战 - 抓取BOSS直聘职位描述 和 数据清洗 零.致谢 感谢BOSS直聘相对权威的招聘信息,使本人有了这次比较有意思的研究之旅. 由于爬虫持续爬取 www.zhipin.com 网 ...
- 自己动手,丰衣足食!Python3网络爬虫实战案例
本教程是崔大大的爬虫实战教程的笔记:网易云课堂 Python3+Pip环境配置 Windows下安装Python: http://www.cnblogs.com/0bug/p/8228378.html ...
- Python爬虫实战四之抓取淘宝MM照片
原文:Python爬虫实战四之抓取淘宝MM照片其实还有好多,大家可以看 Python爬虫学习系列教程 福利啊福利,本次为大家带来的项目是抓取淘宝MM照片并保存起来,大家有没有很激动呢? 本篇目标 1. ...
- 第三百三十节,web爬虫讲解2—urllib库爬虫—实战爬取搜狗微信公众号—抓包软件安装Fiddler4讲解
第三百三十节,web爬虫讲解2—urllib库爬虫—实战爬取搜狗微信公众号—抓包软件安装Fiddler4讲解 封装模块 #!/usr/bin/env python # -*- coding: utf- ...
随机推荐
- Python Socket 编程——聊天室演示样例程序
上一篇 我们学习了简单的 Python TCP Socket 编程,通过分别写服务端和client的代码了解主要的 Python Socket 编程模型.本文再通过一个样例来加强一下对 Socket ...
- 如何在代码中设置以dp为单位的长度
获取当前屏幕的密度系数 ,并设置控件以dp为单位的长宽 float density = getResources().getDisplayMetrics().density; params = ...
- java防止sql注入
public final static String filterSQLInjection(String s) { if (s == null || "".equals(s)) { ...
- Orcla 数据库复习2 --子查询和表连接
子查询和表连接 ①.查询挣钱最多的人的名字 SELECT ename,sal FROM emp WHERE sal=(SELECT MAX(sal) FROM emp); ②.查询有哪些人的工 ...
- pdf+iphone+wechat
可能很多人要问,为啥标题取这个名字. 因为今天在这个上面踩了太多坑.. 我们的需求其实很简单.做一个页面,把pdf文档嵌进去,在线显示. 如此需求,放在PC上chrome浏览器,一个embed标签就搞 ...
- css background-position结合disaply:inline-block使用
$(".icon-a").on('click', function (e) { if ($(this).next().css('display') == "none&qu ...
- spring mvc 返回json的配置
转载自:http://my.oschina.net/haopeng/blog/324934 springMVC-servlet.xml 配置 1 2 3 4 5 6 7 8 9 10 11 12 13 ...
- window.open()函数
http://hi.baidu.com/gagahjt/blog/item/7b76e0dee61b20aecd11661c.html open函数详解: window.open("sUrl ...
- ulipad python相关设置
1)在ulipad下编写的python raw_input/input没有办法正确输出?(获取用户输入) 菜单栏->Python->设置参数->Parameters 处填入 -u
- B/S打印解决方案参考
使用Lodop 插件,该插件占用8000端口,未使用过,仅知依赖浏览器打印 http://blog.csdn.net/harderxin/article/details/17262945 强大的web ...