Python + Selenium 爬取关键字搜索 Google 图片

 # -*- coding: utf-8 -*-

 import json

 import os

 import time

 from multiprocessing import Pool

 import multiprocessing

 import requests

 from selenium import webdriver

 def get_image_links(keyword, num_requested = 1000):
     """Collect original-image URLs from a Google Images search via Selenium.

     A Chrome window is opened on the image-search results for ``keyword``,
     the page is scrolled repeatedly to trigger lazy loading, the
     "show more results" button is clicked between batches, and finally the
     ``"ou"`` field of every ``rg_meta`` JSON blob on the page is gathered.

     Args:
         keyword: Search term (a string). It is URL-encoded before being
             placed in the query string.
         num_requested: Rough number of images wanted. It only controls how
             many scroll/"show more" rounds are attempted
             (``int(num_requested / 400) + 1``); the result is not truncated
             to this number.

     Returns:
         A set of image URL strings (duplicates removed).

     NOTE(review): ``chrome_options=`` and ``find_element(s)_by_xpath`` are
     legacy Selenium APIs; confirm that the installed Selenium version still
     provides them.
     """
     # Local import so this block stays self-contained.
     from urllib.parse import quote_plus

     number_of_scrolls = int(num_requested / 400) + 1
     img_urls = set()  # a set drops duplicate links automatically

     chrome_options = webdriver.ChromeOptions()
     driver = webdriver.Chrome(chrome_options=chrome_options)
     try:
         driver.maximize_window()
         # Encode the keyword so spaces, '&', '#' or non-ASCII text cannot
         # corrupt the query string.
         url = ("https://www.google.com/search?q=" + quote_plus(keyword)
                + "&source=lnms&tbm=isch")
         driver.get(url)

         for _ in range(number_of_scrolls):
             # Several scrolls are needed before a whole batch of results
             # (about 400 images, per the original author's note) is shown.
             for _ in range(5):
                 driver.execute_script("window.scrollBy(0, 100000)")
                 time.sleep(1)
             # Give the page time to refresh; otherwise the button may not
             # be visible yet.
             time.sleep(5)
             try:
                 # The button label depends on the browser language, so
                 # accept both the Chinese and the English caption.
                 driver.find_element_by_xpath(
                     "//input[@value='显示更多结果' or @value='Show more results']"
                 ).click()
             except Exception:
                 # No clickable button: treated as the end of the results.
                 print("reach the end of page ")
                 break

         # Fuzzy match on the class attribute of the metadata containers.
         for meta in driver.find_elements_by_xpath('//div[contains(@class,"rg_meta")]'):
             try:
                 img_urls.add(json.loads(meta.get_attribute('innerHTML'))["ou"])
             except (TypeError, ValueError, KeyError):
                 # Skip a malformed or incomplete entry instead of losing
                 # every URL collected so far.
                 continue
     finally:
         # Always shut the browser down, even if scraping failed midway.
         driver.quit()

     print("finish getting all image urls!")
     return img_urls

 def download(urls,download_dir):
     """Download every URL in ``urls`` into ``download_dir`` (best effort).

     Each file is named after the last component of the URL *path* (query
     string and fragment excluded, percent-escapes decoded). When the path
     has no usable last component, ``image_<index>`` is used instead. A URL
     that cannot be fetched or written is reported on stdout and skipped;
     it never aborts the remaining downloads.

     Args:
         urls: Iterable of image URL strings.
         download_dir: Directory that receives the files. It is created if
             it does not exist.

     Returns:
         None.
     """
     # Local import so this block stays self-contained.
     from urllib.parse import unquote, urlparse

     print("start downloading images!")
     if download_dir:
         os.makedirs(download_dir, exist_ok=True)

     for index, url in enumerate(urls):
         try:
             # basename(url) would keep "?query" parts and is empty for URLs
             # ending in "/", so derive the name from the decoded path only.
             name = os.path.basename(unquote(urlparse(url).path))
             if not name:
                 name = "image_{}".format(index)
             filename = os.path.join(download_dir, name)

             # stream=True lets raise_for_status() run before the body is
             # fetched; the context manager releases the connection either way.
             with requests.get(url, stream=True, timeout=60) as r:
                 r.raise_for_status()
                 data = r.content
             with open(filename, 'wb') as f:
                 f.write(data)
         except Exception as exc:
             # Deliberately broad: one bad URL must not stop the batch, but
             # the failure is reported instead of being dropped silently.
             print("failed to download {}: {}".format(url, exc))
             continue

     print("finish downloading images!")

 # Search terms to crawl; each one gets its own sub-directory below download_dir.
 keywords = ['girl','boy']
 download_dir = './images/'

 # Everything that starts worker processes lives under the main guard: with the
 # "spawn" start method each worker re-imports this module, and unguarded
 # top-level Pool creation would then re-run the whole script in every worker.
 if __name__ == "__main__":
     # Only has an effect in frozen Windows executables; must directly follow
     # the main guard.
     multiprocessing.freeze_support()

     # One output directory per keyword, in the same order as `keywords`.
     download_dirs = []
     for keyword in keywords:
         path = os.path.join(download_dir, keyword)
         os.makedirs(path, exist_ok=True)
         download_dirs.append(path)

     ###################################
     # get image links / multi-process
     ###################################
     p = Pool(4)  # fixed at 4 workers here; Pool() with no argument uses the CPU count
     try:
         pending = [p.apply_async(get_image_links, (keyword,)) for keyword in keywords]
         # apply_async returns ApplyResult handles; .get() blocks until the
         # worker finishes and yields the set of URLs for that keyword.
         img_urls = [result.get() for result in pending]
     finally:
         p.close()
         p.join()

     ###################################
     # download images / multi-process
     ###################################
     p = Pool(4)  # fixed at 4 workers here; Pool() with no argument uses the CPU count
     try:
         for urls, target_dir in zip(img_urls, download_dirs):
             p.apply_async(download, [urls, target_dir])
     finally:
         # close() + join() waits for every queued download to finish.
         p.close()
         p.join()

python+selenium爬取关键字搜索google图片的更多相关文章

Python+Selenium爬取动态加载页面（2）
注: 上一篇<Python+Selenium爬取动态加载页面(1)>讲了基本地如何获取动态页面的数据,这里再讲一个稍微复杂一点的数据获取全国水雨情网.数据的获取过程跟人手动获取过程类似,所 ...
Python+Selenium爬取动态加载页面（1）
注: 最近有一小任务,需要收集水质和水雨信息,找了两个网站:国家地表水水质自动监测实时数据发布系统和全国水雨情网.由于这两个网站的数据都是动态加载出来的,所以我用了Selenium来完成我的数据获取. ...
Python+selenium爬取智联招聘的职位信息
整个爬虫是基于selenium和Python来运行的,运行需要的包 mysql,matplotlib,selenium 需要安装selenium火狐浏览器驱动,百度的搜寻. 整个爬虫是模块化组织的,不 ...
Python 爬虫爬取煎蛋网图片
今天, 试着爬取了煎蛋网的图片. 用到的包: urllib.request os 分别使用几个函数,来控制下载的图片的页数,获取图片的网页,获取网页页数以及保存图片到本地.过程简单清晰明了直接上源代 ...
python+selenium爬取百度文库不能下载的word文档
有些时候我们需要用到百度文库的某些文章时,却发现需要会员才能下载,很难受,其实我们可以通过爬虫的方式来获取到我们所需要的文本. 工具:python3.7+selenium+任意一款编辑器前期准备:可 ...
爬虫之selenium爬取斗鱼主播图片
这是我GitHub上简单的selenium介绍与简单使用:https://github.com/bwyt/spider/tree/master/selenium%E5%9F%BA%E7%A1%80 & ...
python selenium 爬取淘宝
# -*- coding:utf-8 -*- # author : yesehngbao # time:2018/3/29 import re import pymongo from lxml imp ...
python selenium爬取QQ空间方法
from selenium import webdriver import time # 打开浏览器 dr = webdriver.Chrome() # 打开某个网址 dr.get('https:// ...
python selenium爬取自如租房数据保存到TXT文件
# -*- coding: utf-8 -*-"""Created on Fri Aug 31 2018 @author: chenlinlab"" ...

随机推荐

C# System.Web.Mail.MailMessage 发邮件
C# System.Web.Mail.MailMessage 发邮件新建控制台Console项目,然后添加 System.Web引用代码如下: using System; using System ...
python3.6安装 zipimport.ZipImportError: can't decompress data; zlib not available【转】
python3.6.3 安装: .tgz cd Python- ./configure make make altinstall `make altinstall` , 报错: zipimport.Z ...
android: 结合BottomNavigationView、ViewPager和Fragment 实现左右滑动的效果
主界面:MainActivity package com.yongdaimi.android.androidapitest; import android.os.Bundle; import andr ...
vue---监听浏览器窗口的宽度
使用VUE开发后台项目,后台项目需要进行后台根据浏览器窗口进行变化,需要使用vue来监听浏览器的窗口变化. <template> <div class="conte ...
elasticsearch jestclient api
1.es search sroll 可以遍历索引下所有数据 public class TestDemo { @Test public void searchSroll() { JestClientFa ...
apicloud打包成apk
前言:本文是打包vue项目,其他项目也是这样打包页面的开发过程跟我们平时开发一样,利用vue把页面全部完成,最后进行npm run build将项目打包. 接下来就是apicloud打包的过程,首先 ...
postgrelsql 的 wm_concat ： string_agg
string_agg,array_agg 这两个函数的功能大同小异,只不过合并数据的类型不同 array_agg(expression) 把表达式变成一个数组一般配合 array_to_string ...
RAID磁盘阵列结构
RAID磁盘阵列结构原理其实很简单,就是每块硬盘不插在主板的硬盘接口上了,而是全插在RAID卡上,然后RAID卡再插到主板上,由RAID卡统一管理硬盘,做各种RAID磁盘策略(RAID0,RAID1, ...
【视频开发】【计算机视觉】doppia编译之一：前言及安装CUDA
最近做一个"高清视频人流量检测"的项目,由于对实时性要求较高,我们需要较快的检测速度.在搜索茫茫"论"海后,我在"The Fastest Deform ...
eclipse 查看文件在磁盘里的位置

python+selenium爬取关键字搜索google图片

python+selenium爬取关键字搜索google图片的更多相关文章

随机推荐

热门专题