使用 selenium 实现谷歌以图搜图爬虫

实现思路

原理非常简单，就是利用selenium去操作浏览器，获取到想要的链接，然后进行图片的下载，和一般的爬虫无异。

用到的技术：multiprocessing，selenium，xpath，requests

以下按照代码执行的顺序进行讲解。

首先导入需要的包

# coding=utf-8

import base64

import hashlib

import os

import re

import shutil

import time

from multiprocessing import Pool, cpu_count

import requests

import tqdm

from colorama import Fore

from selenium import webdriver

from selenium.common.exceptions import (ElementNotVisibleException,

                                        StaleElementReferenceException)

from selenium.webdriver.common.by import By

from selenium.webdriver.common.keys import Keys

from selenium.webdriver.support import expected_conditions as EC

from selenium.webdriver.support.wait import WebDriverWait

定义一个 run()函数，作为入口。这里使用多进程技术，同时打开多个浏览器进行图片爬取。

def run():
    """Entry point: search Google by image for every picture under ``./upload``.

    The image files are collected recursively, split into one slice per worker
    process, and each slice is handed to ``download_task`` (which drives its own
    browser). The call blocks until every worker has finished.

    Raises:
        Exception: whatever a worker raised; ``Pool.map`` re-raises it here
            instead of dropping it silently.
    """
    upload = r"./upload"  # folder holding the images to upload

    filelist = []
    getfilelist(upload, filelist)  # recursively collect all image files
    if not filelist:
        print("No image found in {}".format(upload))
        return

    # Every worker opens its own browser, so never start more workers than
    # there are images to process.
    num_process = min(cpu_count(), len(filelist))

    # Split the file list into one slice per worker
    result = partition(filelist, num_process)

    pool = Pool(num_process)
    try:
        # map() blocks until all slices are done and propagates worker errors;
        # a map_async() whose result is never read would hide them.
        pool.map(download_task, result)
    finally:
        pool.close()  # no more tasks may be submitted
        pool.join()  # wait for the worker processes to exit

其中 getfilelist()函数是递归查找，工作中用得很多了。

# Lower-case file extensions (including the dot) that are treated as images.
EXTEND = [".bmp", ".jpg", ".jpeg", ".tif", ".tiff",
          ".jfif", ".png", ".gif", ".iff", ".ilbm"]


def is_img(img_path):
    """Tell whether *img_path* names an image, judging by its extension only.

    The extension is compared case-insensitively, so ``photo.JPG`` is accepted
    just like ``photo.jpg``.

    Args:
        img_path: File name or path; the file itself is never opened.

    Returns:
        True when the extension is listed in ``EXTEND``, otherwise False.
    """
    ext = os.path.splitext(img_path)[1].lower()
    return ext in EXTEND

def getfilelist(path, filelist):
    """Recursively collect the image files below *path* into *filelist*.

    Args:
        path: Directory to scan.
        filelist: List that receives the joined path of every entry accepted
            by ``is_img``; it is modified in place.

    Returns:
        None. Results are delivered through *filelist*.
    """
    for entry_name in os.listdir(path):
        full_path = os.path.join(path, entry_name)
        if os.path.isdir(full_path):
            # Sub-folder: descend, sharing the same output list
            getfilelist(full_path, filelist)
            continue
        if is_img(entry_name):
            filelist.append(full_path)

partition()函数用于将一个列表均分为几份，以便实现多进程。

def partition(ls, size):
    """Split *ls* into *size* consecutive slices whose lengths differ by at most one.

    The slices keep the original order, and concatenating them gives back *ls*.
    When the length is not a multiple of *size*, the leading slices receive one
    extra item each instead of the last slice receiving the whole remainder,
    so the work is spread evenly across workers.

    Args:
        ls: Sequence to split (anything that supports ``len`` and slicing).
        size: Number of slices to produce; must be at least 1.

    Returns:
        A list of exactly *size* slices; some are empty when *ls* has fewer
        than *size* items.

    Raises:
        ValueError: if *size* is smaller than 1.
    """
    if size < 1:
        raise ValueError("size must be a positive integer, got {!r}".format(size))

    base, extra = divmod(len(ls), size)
    result = []
    start = 0
    for i in range(size):
        # The first `extra` slices take one additional item
        stop = start + base + (1 if i < extra else 0)
        result.append(ls[start:stop])
        start = stop
    return result

download_task()为具体的下载任务，一个task实例化一个GoogleSearcher类，遍历自己的图片列表进行以图搜图。

def download_task(filelist):
    """Worker job: run a search-by-image for every file in *filelist*.

    One ``GoogleSearcher`` (and therefore one browser) is created for the whole
    list and shut down when the list is done.

    Args:
        filelist: Paths of the images this worker has to process.
    """
    if not filelist:
        # Nothing to do: do not launch a browser for an empty slice
        return

    searcher = GoogleSearcher(
        download=r"./download")
    try:
        for file in filelist:
            try:
                searcher.simple_file_run(file)  # upload one image and search by it
            except Exception as e:
                # A failure on one image must not abort the rest of this worker's list
                print('Search failed for {} - {}'.format(file, e))
    finally:
        # Always quit the browser, otherwise each worker leaves a Chrome process behind
        searcher.driver.quit()

GoogleSearcher类比较长，在注释中进行讲解。

# Login name of the current user, referenced by the (commented-out) Chrome
# user-data-dir option. Windows sets USERNAME while Unix-like systems set USER;
# fall back to an empty string rather than failing at import time.
USERNAME = os.environ.get('USERNAME') or os.environ.get('USER', '')

class GoogleSearcher:
    """Drive a Chrome browser through Google "search by image" and save the results.

    Each instance starts its own Chrome window in ``__init__``. The typical flow
    is ``simple_file_run(path)``: upload the picture, open the similar-images
    list, collect the full-size image links and download them below the
    download folder.
    """

    def __init__(self, download="download", sleep_time=1):
        """Create the download folder and start a Chrome browser.

        Args:
            download: Root folder that downloaded images are stored under.
            sleep_time: Seconds to pause once the result page has appeared.
        """
        super().__init__()
        self._download = download  # download root folder
        self.sleep_time = sleep_time  # pause after the result page is shown
        # Browser-like User-Agent sent with the image download requests
        self.header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"}
        os.makedirs(self._download, exist_ok=True)

        self.option = webdriver.ChromeOptions()
        # Optional: reuse the local Chrome profile
        # self.option.add_argument("--user-data-dir=" + f"C:/Users/{USERNAME}/AppData/Local/Google/Chrome/User Data/")
        # Headless mode is left off on purpose: the author notes it may fail
        # self.option.add_argument("headless")
        self.option.add_argument("disable-gpu")
        self.driver = webdriver.Chrome(options=self.option)

    def close(self):
        """Quit the browser owned by this searcher."""
        self.driver.quit()

    def upload_img_get_html(self, file):
        """Upload *file* to Google Images and return the HTML of the result page.

        Args:
            file: Path of the image to upload; it is converted to an absolute
                path before being sent to the file input.

        Returns:
            The page source of the browser after the result page has appeared.
        """
        print(
            f"{Fore.GREEN} Begin to upload image {os.path.split(file)[1]} {Fore.RESET}")
        self.driver.get("https://www.google.com/imghp")

        # Wait for the camera button, then activate it
        image_button = WebDriverWait(
            self.driver, timeout=20, poll_frequency=0.5).until(
                EC.visibility_of_element_located((By.CLASS_NAME, "LM8x9c")))
        image_button.send_keys(Keys.ENTER)

        # Wait for the "upload an image" tab, then activate its link
        WebDriverWait(self.driver, timeout=20, poll_frequency=0.5).until(
            EC.visibility_of_element_located((By.ID, "dRSWfb")))
        upload = self.driver.find_element(By.XPATH, '//*[@id="dRSWfb"]/div/a')
        upload.send_keys(Keys.ENTER)

        # The upload control is a plain <input>, so the file path can be typed into it
        input_ = WebDriverWait(
            self.driver, timeout=10, poll_frequency=0.5).until(
                EC.visibility_of_element_located((By.ID, 'awyMjb')))
        # Use an absolute path: a relative one such as "./upload/x.jpg" is not
        # reliably accepted by the browser driver
        input_.send_keys(os.path.abspath(file))
        print(f"{Fore.GREEN} uploaded {Fore.RESET}")

        # The browser is redirected to the result page; wait for its top navigation
        WebDriverWait(self.driver, timeout=20, poll_frequency=0.5).until(
            EC.visibility_of_element_located((By.XPATH, '//*[@id="top_nav"]')))
        time.sleep(self.sleep_time)
        print(f"{Fore.GREEN} Finish download source code{Fore.RESET}")
        return self.driver.page_source

    def highlight(self, element):
        """Give *element* a yellow background and red border in the browser."""
        self.driver.execute_script(
            "arguments[0].setAttribute('style', arguments[1]);", element, "background: yellow; border: 2px solid red;")

    def wait_and_click(self, xpath, max_retries=None):
        """Click the element matching *xpath*, refreshing the page and retrying on failure.

        Args:
            xpath: XPath of the element to click.
            max_retries: How many refresh-and-retry rounds are allowed before
                the last error is re-raised. ``None`` (the default) retries
                without limit.

        Returns:
            The element that was clicked.
        """
        retries = 0
        # Retry in a loop rather than by recursion so that a long series of
        # failures cannot exhaust the call stack.
        while True:
            try:
                w = WebDriverWait(self.driver, 15)
                elem = w.until(EC.element_to_be_clickable((By.XPATH, xpath)))
                elem.click()
                self.highlight(elem)
                return elem
            except Exception:
                if max_retries is not None and retries >= max_retries:
                    raise
                retries += 1
                print('Click time out - {}'.format(xpath))
                print('Refreshing browser...')
                self.driver.refresh()
                time.sleep(2)

    def get_extension_from_link(self, link, default='jpg'):
        """Guess the image file extension from *link*.

        The URL path is inspected first, so a query string or fragment
        (``a.png?w=100``) does not hide the extension. If the path gives
        nothing recognisable, the tail of the whole link is tried.

        Args:
            link: Image URL.
            default: Extension returned when none of jpg/jpeg/gif/png is found.

        Returns:
            ``'jpg'``, ``'gif'``, ``'png'`` or *default*.
        """
        from urllib.parse import urlsplit  # local import keeps the block self-contained

        text = str(link)
        try:
            path = urlsplit(text).path
        except ValueError:
            # Malformed URL: only the raw-text fallback below applies
            path = ''

        for candidate in (path, text):
            ext = candidate.rsplit('.', 1)[-1].lower()
            if ext in ('jpg', 'jpeg'):
                return 'jpg'
            if ext in ('gif', 'png'):
                return ext
        return default

    def base64_to_object(self, src):
        """Decode the payload of a base64 ``data:`` URI.

        Args:
            src: Text of the form ``<header>,<base64 data>``.

        Returns:
            The decoded bytes.

        Raises:
            ValueError: if *src* contains no comma.
        """
        _header, encoded = str(src).split(',', 1)
        return base64.decodebytes(encoded.encode('utf-8'))

    def download_images(self, links, download_dir):
        """Save every image in *links* into *download_dir*.

        Files are named after the position of the link (``0000.jpg``,
        ``0001.png`` and so on). A link that fails is reported on stdout and
        skipped.

        Args:
            links: Image sources, either http(s) URLs or inline base64
                ``data:image/jpeg`` / ``data:image/png`` URIs.
            download_dir: Existing folder that receives the files.
        """
        total = len(links)
        for index, link in enumerate(links):
            response = None
            try:
                # Long links (inline base64 data) are cut to 100 characters in the log
                print('Downloading {} : {} / {}'.format(link[:100], index + 1, total))

                if str(link).startswith('data:image/jpeg;base64'):
                    # Inline base64-encoded JPEG
                    payload = self.base64_to_object(src=link)
                    ext = 'jpg'
                elif str(link).startswith('data:image/png;base64'):
                    # Inline base64-encoded PNG
                    payload = self.base64_to_object(src=link)
                    ext = 'png'
                else:
                    # Ordinary hyperlink
                    payload = None
                    response = requests.get(
                        link, stream=True, timeout=5, headers=self.header)
                    # Do not store an HTTP error page as if it were an image
                    response.raise_for_status()
                    # Have the raw stream undo gzip/deflate content encoding
                    response.raw.decode_content = True
                    ext = self.get_extension_from_link(link=link)

                path = os.path.join(download_dir, str(index).zfill(4) + "." + ext)
                try:
                    with open(path, "wb") as f:
                        if payload is not None:
                            f.write(payload)
                        else:
                            shutil.copyfileobj(response.raw, f)
                except Exception as e:
                    print('Save failed - {}'.format(e))
            except Exception as e:
                print('Download failed - ', e)
            finally:
                if response is not None:
                    response.close()  # release the streamed connection

    def get_full_resolution_links(self, load_timeout=10):
        """Step through the result viewer and collect the full-size image links.

        The first thumbnail is clicked, then the RIGHT key is sent repeatedly.
        After each step the ``src`` of the enlarged image is recorded.
        Collection stops once the page scroll offset has stayed unchanged for
        30 consecutive steps.

        Args:
            load_timeout: Maximum seconds to wait for the loading bar of one
                image to disappear before its current ``src`` is taken anyway.

        Returns:
            The collected links, de-duplicated, in first-seen order.
        """
        print('[Full Resolution Mode]')
        time.sleep(1)
        elem = self.driver.find_element(By.TAG_NAME, "body")
        print('Scraping links')
        self.wait_and_click('//div[@data-ri="0"]')
        time.sleep(1)

        links = []
        count = 1
        last_scroll = 0
        scroll_patience = 0
        while True:
            try:
                xpath = '//div[@id="islsp"]//div[@class="v4dQwb"]'
                div_box = self.driver.find_element(By.XPATH, xpath)
                self.highlight(div_box)

                # NOTE(review): the next two XPaths start with '//' and therefore
                # search the whole document, not only div_box; presumably './/'
                # was intended - confirm against the live page before changing.
                xpath = '//img[@class="n3VNCb"]'
                img = div_box.find_element(By.XPATH, xpath)
                self.highlight(img)

                xpath = '//div[@class="k7O2sd"]'
                loading_bar = div_box.find_element(By.XPATH, xpath)

                # Wait for the image to finish loading; until then src holds a
                # base64-encoded image. The wait is bounded so that a bar which
                # never hides cannot hang the crawler.
                deadline = time.monotonic() + load_timeout
                while str(loading_bar.get_attribute('style')) != 'display: none;':
                    if time.monotonic() >= deadline:
                        break
                    time.sleep(0.1)

                src = img.get_attribute('src')
                if src is not None:
                    links.append(src)
                    # Long sources are cut to 100 characters in the log
                    print('%d: %s' % (count, src[:100]))
                    count += 1
            except StaleElementReferenceException:
                # The viewer re-rendered in the meantime; just try the next step
                pass
            except Exception as e:
                print(
                    '[Exception occurred while collecting links from google_full] {}'.format(e))

            # Current vertical scroll offset of the page
            scroll = self.driver.execute_script("return window.pageYOffset;")
            if scroll == last_scroll:
                scroll_patience += 1
            else:
                scroll_patience = 0
                last_scroll = scroll

            if scroll_patience >= 30:
                # No scrolling for 30 steps in a row: stop
                break

            elem.send_keys(Keys.RIGHT)

        links = list(dict.fromkeys(links))  # drop duplicates, keep order
        print('Collect links done. Total: {}'.format(len(links)))
        return links

    def simple_file_run(self, img):
        """Upload one image, search by it and download the similar images found.

        Results are stored in ``<download>/<parent folder of img>/<image name>``.

        Args:
            img: Path of the image to search with.
        """
        img_name = os.path.splitext(os.path.split(img)[1])[0]  # file name without extension
        # Name of the folder containing the image, used as its category
        parent_name = os.path.split(os.path.split(img)[0])[-1]
        print("--> Processing image:  {}  ".format(img_name))

        download_dir = os.path.join(self._download, parent_name, img_name)
        os.makedirs(download_dir, exist_ok=True)

        self.upload_img_get_html(img)  # upload and land on the result page

        # Follow the "similar images" link to reach the image list page
        similar_img_href = self.driver.find_element(
            By.XPATH, '//div[@class="e2BEnf U7izfe"]/h3/a')
        similar_img_href.click()

        links = self.get_full_resolution_links()  # collect the full-size links
        self.download_images(links, download_dir)  # download them
        print("{}Image {} finished\n{}".format(
            Fore.GREEN, img_name, Fore.RESET))

整个流程就跟手动打开浏览器进行操作一样，难点在于如何控制访问速度，避免触发谷歌的反爬机制；一旦出现谷歌验证码，自动破解几乎是不可能的，只能手动输入，相当于免费帮它打码了。

有何用途

当你需要训练一个图片分类的模型，手头上图片有限，那就可以用这个方法，每一张图都找跟它相似的，轻轻松松就把训练集扩大了几十倍（理想情况，不被反爬的话）。

参考

使用 selenium 实现谷歌以图搜图爬虫的更多相关文章

以图搜图(一)：Python实现dHash算法（转）
近期研究了一下以图搜图这个炫酷的东西.百度和谷歌都有提供以图搜图的功能,有兴趣可以找一下.当然,不是很深入.深入的话,得运用到深度学习这货.Python深度学习当然不在话下. 这个功能最核心的东西就是 ...
Google 以图搜图 - 相似图片搜索原理 - Java实现
前阵子在阮一峰的博客上看到了这篇<相似图片搜索原理>博客,就有一种冲动要将这些原理实现出来了. Google "相似图片搜索":你可以用一张图片,搜索互联网上所有与它相 ...
Google 以图搜图 - 相似图片搜索原理 - Java实现（转）
前阵子在阮一峰的博客上看到了这篇<相似图片搜索原理>博客,就有一种冲动要将这些原理实现出来了. Google "相似图片搜索":你可以用一张图片,搜索互联网上所有与它相 ...
以图搜图之模型篇：基于 InceptionV3 的模型 finetune
在以图搜图的过程中,需要依赖模型提取特征,通过特征之间的欧式距离来找到相似的图形. 本次我们主要讲述以图搜图模型创建的方法. 图片预处理方法,看这里:https://keras.io/zh/prepr ...
谷歌百度以图搜图 "感知哈希算法" C#简单实现
/// <summary> /// 感知哈希算法 /// </summary> public class ImageComparer { /// <summary> ...
[No000007]搜索引擎以图搜图的原理
之前,Google把"相似图片搜索"正式放上了首页. 你可以用一张图片,搜索互联网上所有与它相似的图片.点击搜索框中照相机的图标. 一个对话框会出现. 你输入图片的网址,或者直接上 ...
php 以图搜图
感知哈希算法count < =5 匹配最相似count > 10 两张不同的图片var_dump(ImageHash::run('1.jpg’, '2.jpg’)); <?php c ...
JFinal-美女图爬虫-一个不正经的爬虫代码
去年我做了一个项目,大量使用爬虫抓取数据,使用JFinal+JSoup组合,抓取数据,数据清洗筛选,最终保存到数据库里,结构化. 今天,我发布一个不正经的爬虫项目,如果你对JSoup做爬虫感兴趣,可以 ...
【网页浏览】国内伪P站搜图网站
蛮好用的国内p站搜图网站(伪p站) 传送链接

随机推荐

leetcode 每日签到 409. 最长回文串
题目: 最长回文串给定一个包含大写字母和小写字母的字符串,找到通过这些字母构造成的最长的回文串. 在构造过程中,请注意区分大小写.比如 "Aa" 不能当做一个回文字符串. 注意: ...
linux golden-dict个性化添加词典
国内有道,百度等参考https://www.jianshu.com/p/9bf577335945如果和我一样,想要添加大名鼎鼎的韦氏词典英文词典,则地址如下https://www.merriam-we ...
python之目录
一.python基础 python之字符串str操作方法 python之int (整型) python之bool (布尔值) python之str (字符型) python之ran ...
Unix 网络编程卷一源码编译踩坑记录 ubuntu 19.10
在阅读unpv1时运行源代码的环境配置,这里简单记录一下. 源代码里的README 写得挺详细的,但是在Linux 系统下还是没办法直接编译通过, 这里我使用的是ubuntu 19.10(在腾讯云1 ...
Sql Server数据库性能优化之索引
最近在做SQL Server数据库性能优化,因此复习一下索引.视图.存储过程等知识点.本篇为索引篇,知识整理来源于互联网. 索引是加快检索表中数据的方法,它对数据表中一个或者多个列的值进行结构排序,是数 ...
读者来信-5 | 如果你家HBase集群Region太多请点进来看看，这个问题你可能会遇到
前言:<读者来信>是HBase老店开设的一个问答专栏,旨在能为更多的小伙伴解决工作中常遇到的HBase相关的问题.老店会尽力帮大家解决这些问题或帮你发出求救贴,老店希望这会是一个互帮互助的 ...
模块 schedule 定时任务
schedule模块实现定时任务 2018-08-29 15:01:51 更多一.官方示例 import schedule import time def job(): print("I' ...
CodeForces 190C STL
Portal: http://codeforces.com/problemset/problem/190/C 一道卡输入输出的蛋疼题题意:给你一个由pair和int所组成的沙茶字符串(最大含有1e5 ...
Java 为 Excel 中的行设置交替背景色
在制作Excel表格时,通过将数据表中上下相邻的两行用不同的背景色填充,可以使各行的数据看起来更清楚,避免看错行,同时也能增加Excel表格的美观度.本文将介绍如何在Java程序中为 Excel 奇数 ...
Docker 容器引擎
Docker 是世界领先的软件容器平台.是一个开源的应用容器引擎,让开发者可以打包他们的应用以及依赖包到一个可移植的镜像中, 然后发布到任何流行的Linux或Windows机器上,可以实现虚拟化(软件 ...

使用 selenium 实现谷歌以图搜图爬虫

实现思路

有何用途

参考

使用 selenium 实现谷歌以图搜图爬虫的更多相关文章

随机推荐

热门专题