python+selenium爬取关键字搜索google图片

 # -*- coding: utf-8 -*-

 import json

 import os

 import time

 from multiprocessing import Pool

 import multiprocessing

 import requests

 from selenium import webdriver

 def get_image_links(keyword, num_requested = 1000):
     """Collect original-image URLs from a Google Images search via Selenium.

     Starts a Chrome session, opens the image-search result page for
     ``keyword``, scrolls repeatedly so that lazily loaded results appear,
     clicks the "show more results" button between scroll rounds, and finally
     reads the ``"ou"`` field from the JSON stored in every ``rg_meta`` div.

     Args:
         keyword: Search term. It is URL-encoded before being placed in the
             query string, so spaces and non-ASCII text are safe.
         num_requested: Only controls how many scroll / "show more" rounds are
             attempted (one round per 400 requested images, plus one). The
             result is NOT truncated to this number.

     Returns:
         A set of image URL strings (a set, so duplicate links collapse).
     """
     # Function-scope imports keep this block self-contained.
     from urllib.parse import quote_plus
     from selenium.common.exceptions import WebDriverException

     number_of_scrolls = int(num_requested / 400) + 1
     img_urls = set()

     # The button label depends on the browser UI language, so accept both
     # the Chinese and the English caption.
     more_button_xpath = (
         "//input[@value='显示更多结果' or @value='Show more results']"
     )

     chrome_options = webdriver.ChromeOptions()
     # Optional tweaks that can be enabled when needed:
     # chrome_options.add_argument('--headless')  # run without a window
     # chrome_options.add_argument('user-agent="Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"')
     # chrome_options.add_argument("lang=en_US")  # force the UI language
     # prefs = {"profile.managed_default_content_settings.images":2}
     # chrome_options.add_experimental_option("prefs",prefs)  # do not load images
     driver = webdriver.Chrome(chrome_options=chrome_options)

     # try/finally guarantees the browser process is shut down even when
     # navigation or element lookup raises.
     try:
         driver.maximize_window()
         url = (
             "https://www.google.com/search?q="
             + quote_plus(keyword)
             + "&source=lnms&tbm=isch"
         )
         driver.get(url)

         for _ in range(number_of_scrolls):
             # Several scrolls are needed before a full batch of results
             # has been loaded into the page.
             for _ in range(5):
                 driver.execute_script("window.scrollBy(0, 100000)")
                 time.sleep(1)
             # Give the page time to refresh; otherwise the button may not
             # be visible yet.
             time.sleep(5)
             try:
                 driver.find_element_by_xpath(more_button_xpath).click()
             except WebDriverException:
                 # Button missing or not clickable: no more results to load.
                 print("reach the end of page ")
                 break

         # Fuzzy match on the class attribute of the metadata containers.
         imgs = driver.find_elements_by_xpath('//div[contains(@class,"rg_meta")]')
         for img in imgs:
             try:
                 img_url = json.loads(img.get_attribute('innerHTML'))["ou"]
             except (ValueError, KeyError, TypeError):
                 # Skip an element whose metadata is missing or malformed
                 # instead of discarding every link collected so far.
                 continue
             img_urls.add(img_url)
     finally:
         driver.quit()

     print("finish getting all image urls!")
     return img_urls

 def _filename_from_url(url, fallback):
     """Return a file name taken from the path part of ``url``.

     The query string and fragment are ignored and percent-escapes are
     decoded. ``fallback`` is returned when the path yields no usable name
     (for example a URL that ends with ``/``).
     """
     from urllib.parse import unquote, urlsplit

     name = os.path.basename(unquote(urlsplit(url).path))
     if name in ("", ".", ".."):
         return fallback
     return name


 def download(urls,download_dir):
     """Download every URL in ``urls`` into ``download_dir`` (best effort).

     Args:
         urls: Iterable of image URL strings.
         download_dir: Existing directory that receives the files.

     A URL that cannot be fetched or saved is reported on stdout and skipped;
     the remaining URLs are still processed. Nothing is returned.
     """
     print("start downloading images!")
     used_names = set()
     for index, url in enumerate(urls):
         try:
             name = _filename_from_url(url, "image_{}".format(index))
             # Different URLs often share a basename (e.g. "image.jpg");
             # prefix the index so later files do not overwrite earlier ones.
             if name in used_names:
                 name = "{}_{}".format(index, name)
             used_names.add(name)
             filename = os.path.join(download_dir, name)

             r = requests.get(url, stream=True, timeout=60)
             try:
                 r.raise_for_status()
                 # Read the whole body before opening the file so a failed
                 # transfer does not leave a truncated file behind.
                 data = r.content
             finally:
                 r.close()

             with open(filename, 'wb') as f:
                 f.write(data)
         except Exception as exc:
             # Deliberately broad: one bad link must not stop the batch,
             # but the failure is reported instead of vanishing silently.
             print("failed to download {}: {}".format(url, exc))
             continue
     print("finish downloading images!")

 # Search terms to scrape; each one gets its own sub-directory.
 keywords = ['girl','boy']

 # Root directory under which the per-keyword folders are created.
 download_dir = './images/'


 def main():
     """Scrape image links for every keyword, then download them.

     Both stages fan out over a pool of worker processes: first one
     ``get_image_links`` task per keyword, then one ``download`` task per
     keyword with the links gathered in the first stage.
     """
     download_dirs = []
     for keyword in keywords:
         path = os.path.join(download_dir, keyword)
         # exist_ok avoids the check-then-create race of exists()+makedirs().
         os.makedirs(path, exist_ok=True)
         download_dirs.append(path)

     ###################################
     # get image links/MultiProcess
     ###################################
     # Pool() without an argument uses one process per CPU core; adjust the
     # worker count as needed.
     p = Pool(4)
     try:
         pending = [p.apply_async(get_image_links, (keyword,)) for keyword in keywords]
         # apply_async returns ApplyResult handles; get() blocks until the
         # worker is done and yields the set of links for that keyword.
         img_urls = [result.get() for result in pending]
     finally:
         p.close()
         p.join()

     ###################################
     # download images/MultiProcess
     ###################################
     p = Pool(4)
     try:
         pending = [
             p.apply_async(download, (urls, path))
             for urls, path in zip(img_urls, download_dirs)
         ]
         # Collect the results so an unexpected worker error is raised here
         # instead of being dropped silently.
         for result in pending:
             result.get()
     finally:
         p.close()
         p.join()


 # The guard is required for multiprocessing: with the "spawn" start method
 # (e.g. on Windows) every worker re-imports this module, and unguarded pool
 # creation at import time would recurse.
 if __name__ == "__main__":
     multiprocessing.freeze_support()
     main()

python+selenium爬取关键字搜索google图片的更多相关文章

Python+Selenium爬取动态加载页面（2）
注: 上一篇<Python+Selenium爬取动态加载页面(1)>讲了基本地如何获取动态页面的数据,这里再讲一个稍微复杂一点的数据获取全国水雨情网.数据的获取过程跟人手动获取过程类似,所 ...
Python+Selenium爬取动态加载页面（1）
注: 最近有一小任务,需要收集水质和水雨信息,找了两个网站:国家地表水水质自动监测实时数据发布系统和全国水雨情网.由于这两个网站的数据都是动态加载出来的,所以我用了Selenium来完成我的数据获取. ...
Python+selenium爬取智联招聘的职位信息
整个爬虫是基于selenium和Python来运行的,运行需要的包 mysql,matplotlib,selenium 需要安装selenium火狐浏览器驱动,百度的搜寻. 整个爬虫是模块化组织的,不 ...
Python 爬虫爬取煎蛋网图片
今天, 试着爬取了煎蛋网的图片. 用到的包: urllib.request os 分别使用几个函数,来控制下载的图片的页数,获取图片的网页,获取网页页数以及保存图片到本地.过程简单清晰明了直接上源代 ...
python+selenium爬取百度文库不能下载的word文档
有些时候我们需要用到百度文库的某些文章时,却发现需要会员才能下载,很难受,其实我们可以通过爬虫的方式来获取到我们所需要的文本. 工具:python3.7+selenium+任意一款编辑器前期准备:可 ...
爬虫之selenium爬取斗鱼主播图片
这是我GitHub上简单的selenium介绍与简单使用:https://github.com/bwyt/spider/tree/master/selenium%E5%9F%BA%E7%A1%80 & ...
python selenium 爬取淘宝
# -*- coding:utf-8 -*- # author : yesehngbao # time:2018/3/29 import re import pymongo from lxml imp ...
python selenium爬取QQ空间方法
from selenium import webdriver import time # 打开浏览器 dr = webdriver.Chrome() # 打开某个网址 dr.get('https:// ...
python selenium爬取自如租房数据保存到TXT文件
# -*- coding: utf-8 -*-"""Created on Fri Aug 31 2018 @author: chenlinlab"" ...

随机推荐

【转】使用eclipse的todo标签管理任务
Eclipse中的一些特殊的注释技术包括: 1. // TODO —— 表示尚未完成的待办事项. 2. // XXX —— 表示被注释的代码虽然实现了功能,但是实现方案有待商榷 ...
[Web 安全] WASC 和 OWASP两个web安全方面组织机构介绍
copy from : http://blog.sina.com.cn/s/blog_70b7aab9010126mn.html WASC 和 OWASP.这两个组织在呼吁企业加强应用安全意识和指导 ...
C# default(T)关键字
C#关键词default函数,default(T)可以得到该类型的默认值. C#在类初始化时,会给未显式赋值的字段、属性赋上默认值,但是值变量却不会. 值变量可以使用默认构造函数赋值,或者使用defa ...
docker search mysql Cannot connect to the Docker daemon at unix:///var/run/docker.sock. Is the docker daemon running?
1.docker search mysql 报错 [root@localhost usr]# docker search mysqlCannot connect to the Docker daemo ...
html网页调用本地exe程序的实现方法（转）
https://blog.csdn.net/ilovecr7/article/details/46803711 最近在做一个项目,要什么网页里调exe...开始以为不能实现,后来想想很多就跟淘宝网页上 ...
python自动化测试之连接几组测试包实例
python自动化测试之连接几组测试包实例本文实例讲述了python自动化测试之连接几组测试包的方法,分享给大家供大家参考.具体方法如下: 具体代码如下: class RomanNumera ...
LeetCode_437. Path Sum III
437. Path Sum III Easy You are given a binary tree in which each node contains an integer value. Fin ...
【mysql 默认密码】ubuntu 上初次启动mysql 默认密码
对于debian系的系统,mysql初始默认密码 cat /etc/mysql/debian.cnf
编写vscode插件
一.参考学习 https://www.cnblogs.com/liuxianan/p/vscode-plugin-publish.html https://code.visualstudio.com/ ...
Java之整数运算
Java的整数运算遵循四则运算规则,可以使用任意嵌套的小括号.四则运算规则和初等数学一致.例如: public class Main { public static void main(String[ ...

python+selenium爬取关键字搜索google图片

python+selenium爬取关键字搜索google图片的更多相关文章

随机推荐

热门专题