# -*- coding: utf-8 -*-
"""Scrape Google image search results for a set of keywords.

Uses Selenium to scroll through Google Images and collect the original
image URLs, then downloads them in parallel with ``multiprocessing``.
"""

import json
import multiprocessing
import os
import time
from multiprocessing import Pool

import requests
from selenium import webdriver


def get_image_links(keyword, num_requested=1000):
    """Collect image URLs for *keyword* from Google image search.

    Parameters
    ----------
    keyword : str
        The search term.
    num_requested : int
        Approximate number of images wanted; Google loads roughly 400
        images per "page", so this controls how many times we scroll and
        click "Show more results".

    Returns
    -------
    set of str
        De-duplicated original-image URLs.
    """
    number_of_scrolls = int(num_requested / 400) + 1
    img_urls = set()  # a set automatically removes duplicate links
    chrome_options = webdriver.ChromeOptions()
    # chrome_options.add_argument('--headless')  # headless browser
    # chrome_options.add_argument('user-agent="Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"')
    # chrome_options.add_argument("lang=en_US")  # set browser language
    # prefs = {"profile.managed_default_content_settings.images": 2}
    # chrome_options.add_experimental_option("prefs", prefs)  # skip loading images
    driver = webdriver.Chrome(chrome_options=chrome_options)
    try:
        driver.maximize_window()
        url = ("https://www.google.com/search?q=" + keyword
               + "&source=lnms&tbm=isch")
        driver.get(url)
        for _ in range(number_of_scrolls):
            # Multiple scrolls are needed to reveal all ~400 images of a page.
            for _ in range(5):
                driver.execute_script("window.scrollBy(0, 100000)")
                time.sleep(1)
            # Wait for the page to refresh; otherwise the "more results"
            # button may not be visible yet.
            time.sleep(5)
            try:
                # The button label depends on the browser locale.
                # driver.find_element_by_xpath("//input[@value='Show more results']").click()
                driver.find_element_by_xpath("//input[@value='显示更多结果']").click()
            except Exception:
                print("reach the end of page ")
                break
        # with open('page.html', 'w') as f:
        #     f.write(driver.page_source)
        # Fuzzy class match: each "rg_meta" div carries JSON metadata whose
        # "ou" key is the original image URL.
        imgs = driver.find_elements_by_xpath('//div[contains(@class,"rg_meta")]')
        for img in imgs:
            img_url = json.loads(img.get_attribute('innerHTML'))["ou"]
            img_urls.add(img_url)
    finally:
        # Always release the browser process, even if scraping fails.
        driver.quit()
    print("finish getting all image urls!")
    return img_urls


def download(urls, download_dir):
    """Download every URL in *urls* into *download_dir* (best effort).

    The file name is taken from the last path component of the URL.
    URLs that fail to download are silently skipped.
    """
    print("start downloading images!")
    for url in urls:
        filename = os.path.join(download_dir, os.path.basename(url))
        try:
            r = requests.get(url, stream=True, timeout=60)
            r.raise_for_status()
            with open(filename, 'wb') as f:
                f.write(r.content)
        except Exception:
            # Deliberate best-effort behavior: skip broken URLs.
            continue
    print("finish downloading images!")


def main():
    keywords = ['girl', 'boy']
    download_dir = './images/'
    download_dirs = []
    for keyword in keywords:
        path = os.path.join(download_dir, keyword)
        download_dirs.append(path)
        if not os.path.exists(path):
            os.makedirs(path)

    # Serial version, kept for reference:
    # for keyword in keywords:
    #     image_urls = get_image_links(keyword)
    #     download(image_urls, download_dir)

    ###################################
    # get image links / multiprocess
    ###################################
    img_urls = []
    # Default pool size is the number of CPU cores; change it as needed.
    p = Pool(4)
    for keyword in keywords:
        img_urls.append(p.apply_async(get_image_links, (keyword,)))
    # apply_async returns AsyncResult objects; .get() blocks until each
    # worker finishes and replaces the handle with the real URL set.
    for i, result in enumerate(img_urls):
        img_urls[i] = result.get()
    p.close()
    p.join()

    ###################################
    # download images / multiprocess
    ###################################
    p = Pool(4)
    for i, urls in enumerate(img_urls):
        p.apply_async(download, [urls, download_dirs[i]])
    p.close()
    p.join()


if __name__ == '__main__':
    # The main guard is required with multiprocessing on Windows (spawn
    # start method re-imports this module in every worker); without it the
    # pool setup would run recursively. freeze_support() additionally
    # supports frozen (e.g. py2exe) executables.
    multiprocessing.freeze_support()
    main()

python+selenium爬取关键字搜索google图片的更多相关文章

  1. Python+Selenium爬取动态加载页面(2)

    注: 上一篇<Python+Selenium爬取动态加载页面(1)>讲了基本地如何获取动态页面的数据,这里再讲一个稍微复杂一点的数据获取全国水雨情网.数据的获取过程跟人手动获取过程类似,所 ...

  2. Python+Selenium爬取动态加载页面(1)

    注: 最近有一小任务,需要收集水质和水雨信息,找了两个网站:国家地表水水质自动监测实时数据发布系统和全国水雨情网.由于这两个网站的数据都是动态加载出来的,所以我用了Selenium来完成我的数据获取. ...

  3. Python+selenium爬取智联招聘的职位信息

    整个爬虫是基于selenium和Python来运行的,运行需要的包 mysql,matplotlib,selenium 需要安装selenium火狐浏览器驱动,百度的搜寻. 整个爬虫是模块化组织的,不 ...

  4. Python 爬虫 爬取 煎蛋网 图片

    今天, 试着爬取了煎蛋网的图片. 用到的包: urllib.request os 分别使用几个函数,来控制下载的图片的页数,获取图片的网页,获取网页页数以及保存图片到本地.过程简单清晰明了 直接上源代 ...

  5. python+selenium爬取百度文库不能下载的word文档

    有些时候我们需要用到百度文库的某些文章时,却发现需要会员才能下载,很难受,其实我们可以通过爬虫的方式来获取到我们所需要的文本. 工具:python3.7+selenium+任意一款编辑器 前期准备:可 ...

  6. 爬虫之selenium爬取斗鱼主播图片

    这是我GitHub上简单的selenium介绍与简单使用:https://github.com/bwyt/spider/tree/master/selenium%E5%9F%BA%E7%A1%80 & ...

  7. python selenium 爬取淘宝

    # -*- coding:utf-8 -*- # author : yesehngbao # time:2018/3/29 import re import pymongo from lxml imp ...

  8. python selenium爬取QQ空间方法

    from selenium import webdriver import time # 打开浏览器 dr = webdriver.Chrome() # 打开某个网址 dr.get('https:// ...

  9. python selenium爬取自如租房数据保存到TXT文件

    # -*- coding: utf-8 -*-"""Created on Fri Aug 31  2018 @author: chenlinlab"" ...

随机推荐

  1. SubQuery优化

    https://zhuanlan.zhihu.com/p/60380557 子查询,分两种情况, 对于在From中的,称为‘derived table’,这种场景比较简单 对于在select,wher ...

  2. js实现字符串切割并转换成对象格式保存到本地

    // split() 将字符串按照指定的规则分割成字符串数组,并返回此数组(字符串转数组的方法) //分割字符串 var bStr = "www.baidu.con"; var a ...

  3. linux中环境变量和系统加载环境变量的顺序

    一.系统环境变量: /etc/profile :这个文件预设了几个重要的变量,例如PATH, USER, LOGNAME, MAIL, INPUTRC, HOSTNAME, HISTSIZE, uma ...

  4. 品优购商城项目(五)消息中间件 ActiveMQ

    消息中间件用于降低各个项目模块的耦合,适用于不需要等待返回消息才能进入下一个业务环节的模块,以及实时要求性不高的业务模块. 一.JMS JMS(Java Messaging Service)是Java ...

  5. plsql 的三种循环

    set serveroutput on declare pnum ; begin loop dbms_output.put_line(pnum); pnum :; end loop; end; / s ...

  6. jQuery fancy box 移除close "X" button

    version: fancybox 2.1.4 <div id="reloadPagePopup" class="div-fancy-box"> & ...

  7. [LeetCode] 77. Combinations 全组合

    Given two integers n and k, return all possible combinations of k numbers out of 1 ... n. For exampl ...

  8. 大网扫描在发现APT组织方面的一个应用

    如何发现CobalStike服务端? 答: 扫HTTP response header: "HTTP/1.1 404 Not Found" balaba-"Server& ...

  9. You can mix require and export. You can't mix import and module.exports.

    You can mix require and export. You can't mix import and module.exports.

  10. Kubernetes 监控方案之 Prometheus Operator(十九)

    目录 一.Prometheus 介绍 1.1.Prometheus 架构 1.2.Prometheus Operator 架构 二.Helm 安装部署 2.1.Helm 客户端安装 2.2.Tille ...