Python用哈希算法查找相似图片（包括不同分辨率，不同大小，不同格式的图片）

# -*- coding: utf-8 -*-

'''

Python用哈希算法查找相似图片并放入[_df]的文件夹中

相似图片包括不同分辨率，不同大小，不同格式，只要图片相似就会算重复文件

安装cv2

pip install opencv-python

'''

import os

import cv2

import numpy as np

import shutil

import random

class DuplicateFiles (object):
    """Find visually similar images under a directory and move them aside.

    Every image found (recursively) under ``dir`` is compared against the
    images that follow it in the listing using three perceptual hashes
    (average, difference, DCT based) and two histogram measures.  When any
    measure exceeds ``SIMILARITY_THRESHOLD`` the later image is moved into
    the sibling directory ``<dir>_df``; the earlier ("master") image is moved
    there too once at least one duplicate was found for it.
    """

    # File extensions (lower-case, without the dot) that are treated as images.
    # The same set is used for master and candidate files.
    IMAGE_EXTENSIONS = frozenset({'jpg', 'jpeg', 'bmp', 'png', 'gif', 'jfif'})

    # A pair is considered duplicate when any score is strictly above this.
    SIMILARITY_THRESHOLD = 0.90

    # Root directory to scan; overwritten per instance in __init__.
    dir = ''

    def __init__(self, dir):
        """Remember the root directory ``dir`` that ``mvPhoto`` will scan."""
        self.dir = dir

    # ------------------------------------------------------------------
    # private helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _to_gray(img):
        """Return ``img`` as a single-channel image (no-op if already 2-D)."""
        if img.ndim == 2:
            return img
        return cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    @staticmethod
    def _bits_to_str(bits):
        """Flatten a boolean array row by row into a string of '1'/'0'."""
        return ''.join('1' if bit else '0' for bit in bits.ravel())

    def _is_image(self, filename):
        """Return True when ``filename`` has a supported image extension."""
        ext = os.path.splitext(filename)[1]
        return ext[1:].lower() in self.IMAGE_EXTENSIONS

    @staticmethod
    def _read_image(path):
        """Decode ``path`` as a colour image; return None if it cannot be read.

        ``np.fromfile`` + ``cv2.imdecode`` is used (instead of ``cv2.imread``)
        so that paths containing non-ASCII characters work on Windows.
        """
        try:
            raw = np.fromfile(path, dtype=np.uint8)
        except OSError:
            return None
        if raw.size == 0:
            # imdecode raises on an empty buffer instead of returning None
            return None
        return cv2.imdecode(raw, cv2.IMREAD_COLOR)

    # ------------------------------------------------------------------
    # hashing
    # ------------------------------------------------------------------
    def aHash(self, img, shape=(10, 10)):
        """Average hash.

        The image is shrunk to ``shape`` (width, height), converted to
        grayscale, and every pixel brighter than the mean becomes '1'.

        Returns:
            A string of ``shape[0] * shape[1]`` characters, row by row.
        """
        gray = self._to_gray(cv2.resize(img, shape))
        # mean() works on the actual pixel count, so non-default shapes are
        # handled, and it accumulates in float so uint8 cannot overflow.
        return self._bits_to_str(gray > gray.mean())

    def dHash(self, img, shape=(10, 10)):
        """Difference hash.

        The image is shrunk to one column wider than ``shape`` and each pixel
        is compared with its right-hand neighbour ('1' when the left pixel is
        brighter).

        Returns:
            A string of ``shape[0] * shape[1]`` characters, row by row.
        """
        # cv2.resize takes (width, height): the result has shape[1] rows and
        # shape[0] + 1 columns.
        gray = self._to_gray(cv2.resize(img, (shape[0] + 1, shape[1])))
        return self._bits_to_str(gray[:, :-1] > gray[:, 1:])

    def pHash(self, img, shape=(10, 10)):
        """Perceptual (DCT) hash.

        The image is shrunk to 32x32, converted to grayscale and transformed
        with a DCT; the low-frequency top-left corner of size ``shape``
        (width, height, at most 32 each) is thresholded against its mean.

        Returns:
            A list of 0/1 ints, row by row.
        """
        gray = self._to_gray(cv2.resize(img, (32, 32)))
        dct = cv2.dct(np.float32(gray))
        low_freq = dct[:shape[1], :shape[0]]
        return (low_freq > low_freq.mean()).astype(int).ravel().tolist()

    # ------------------------------------------------------------------
    # histogram similarity
    # ------------------------------------------------------------------
    def classify_hist_with_split(self, image1, image2, size=(256, 256)):
        """Mean per-channel histogram similarity of two images.

        Both images are resized to ``size``, split into their channels, and
        ``calculate`` is averaged over the channel pairs.
        """
        channels1 = cv2.split(cv2.resize(image1, size))
        channels2 = cv2.split(cv2.resize(image2, size))
        scores = [self.calculate(c1, c2) for c1, c2 in zip(channels1, channels2)]
        return sum(scores) / len(scores)

    def calculate(self, image1, image2):
        """Histogram overlap of channel 0 of two images.

        For each of the 256 intensity bins the score is
        ``1 - |h1 - h2| / max(h1, h2)`` (1 when the bins are equal, including
        both empty); the mean over all bins is returned as a float in [0, 1].
        """
        # The upper bound of the range is exclusive, so 256 is needed to
        # count pixels of value 255.
        hist1 = cv2.calcHist([image1], [0], None, [256], [0, 256]).ravel()
        hist2 = cv2.calcHist([image2], [0], None, [256], [0, 256]).ravel()
        larger = np.maximum(hist1, hist2).astype(np.float64)
        gap = np.abs(hist1.astype(np.float64) - hist2)
        scores = np.ones_like(larger)
        filled = larger > 0  # bins empty in both images keep the score 1
        scores[filled] = 1.0 - gap[filled] / larger[filled]
        return float(scores.mean())

    # ------------------------------------------------------------------
    # hash comparison
    # ------------------------------------------------------------------
    def cmpHash(self, hash1, hash2, shape=(10, 10)):
        """Fraction of positions at which two hashes agree.

        Args:
            hash1, hash2: equal-length sequences (strings or lists).
            shape: kept for backward compatibility only; the ratio is taken
                over the actual hash length.

        Returns:
            A float in [0, 1], or -1 when the lengths differ.
        """
        if len(hash1) != len(hash2):
            return -1
        if not hash1:
            return 0.0
        matches = sum(1 for a, b in zip(hash1, hash2) if a == b)
        return matches / len(hash1)

    # ------------------------------------------------------------------
    # file handling
    # ------------------------------------------------------------------
    def mymovefile(self, srcfile, dstpath, ffname=None):
        """Move ``srcfile`` into directory ``dstpath``.

        Args:
            srcfile: path of the file to move; a message is printed and
                nothing happens when it does not exist.
            dstpath: destination directory (created if missing); a trailing
                separator is optional.
            ffname: new file name, or a falsy value to keep the original.

        If the destination name is already taken, ``_1``, ``_2``, ... is
        appended before the extension so no existing file is overwritten.
        """
        if not os.path.isfile(srcfile):
            print ("%s not exist!"%(srcfile))
            return
        fname = ffname if ffname else os.path.basename(srcfile)
        os.makedirs(dstpath, exist_ok=True)
        target = os.path.join(dstpath, fname)
        if os.path.exists(target):
            stem, ext = os.path.splitext(fname)
            suffix = 1
            while os.path.exists(target):
                target = os.path.join(dstpath, '%s_%d%s' % (stem, suffix, ext))
                suffix += 1
        shutil.move(srcfile, target)

    def list_all_files(self, rootdir):
        """Return the paths of all regular files under ``rootdir``, recursively.

        Entries are visited in ``os.listdir`` order; a sub-directory's files
        appear at the position of that sub-directory.
        """
        _files = []
        for entry in os.listdir(rootdir):
            path = os.path.join(rootdir, entry)
            if os.path.isdir(path):
                _files.extend(self.list_all_files(path))
            elif os.path.isfile(path):
                _files.append(path)
        return _files

    def _is_similar(self, master_hashes, img1, img2):
        """Return True when any similarity measure exceeds the threshold."""
        a_hash, d_hash, p_hash = master_hashes
        # Lambdas keep the evaluation lazy: later (more expensive) measures
        # are skipped as soon as one of them reports a match.
        measures = (
            lambda: self.cmpHash(a_hash, self.aHash(img2)),
            lambda: self.cmpHash(d_hash, self.dHash(img2)),
            lambda: self.cmpHash(p_hash, self.pHash(img2)),
            lambda: self.classify_hist_with_split(img1, img2),
            lambda: self.calculate(img1, img2),
        )
        return any(measure() > self.SIMILARITY_THRESHOLD for measure in measures)

    def mvPhoto(self):
        """Scan ``self.dir`` and move groups of similar images to ``<dir>_df``.

        Each duplicate is renamed ``<master stem>_<original name>`` so that
        it sorts next to the master it matched.
        """
        photoList = self.list_all_files(self.dir)
        # Sibling directory of the scanned root, so moved files are never
        # part of the tree being scanned.
        dst_dir = os.path.normpath(self.dir) + '_df'

        for i, photo in enumerate(photoList):
            # Already moved away as a duplicate of an earlier master.
            if not os.path.isfile(photo):
                continue
            fname = os.path.basename(photo)
            print('Master:'+fname)
            if not self._is_image(fname):
                continue
            img1 = self._read_image(photo)
            if img1 is None:
                continue

            stem = os.path.splitext(fname)[0]
            # The master's hashes do not change across candidates.
            master_hashes = (self.aHash(img1), self.dHash(img1), self.pHash(img1))
            found_duplicate = False

            for candidate in photoList[i + 1:]:
                if not os.path.isfile(candidate):
                    continue
                sname = os.path.basename(candidate)
                if not self._is_image(sname):
                    continue
                img2 = self._read_image(candidate)
                if img2 is None:
                    continue
                if self._is_similar(master_hashes, img1, img2):
                    found_duplicate = True
                    print('    move file:'+candidate)
                    print('ffname[0]:'+stem)
                    self.mymovefile(candidate, dst_dir, stem+'_'+sname)

            # Move the master last so it ends up next to its duplicates.
            if found_duplicate:
                self.mymovefile(photo, dst_dir, fname)

if __name__ == "__main__":

    # Script entry point: choose the directory to scan, then move every
    # group of similar images out of it.

    # To scan a fixed directory instead, assign its path, e.g.:
    #dir = r'E:\python\photoCompare\328'

    # NOTE: os.getcwd() is the current working directory of the process,
    # which is not necessarily the directory containing this file.
    # NOTE(review): the name `dir` shadows the builtin; it is left unchanged
    # here because other code in this file may read this module-level name.
    dir = os.getcwd()

    duplicateFiles = DuplicateFiles(dir)

    duplicateFiles.mvPhoto()

Python用哈希算法查找相似图片（包括不同分辨率，不同大小，不同格式的图片）的更多相关文章

将jpg压缩成webp格式的图片
cwebp名称 cwebp -压缩图像文件为的WebP文件概要 cwebp [选项] INPUT_FILE -o output_file.webp描述 cwebp压缩使用的WebP格式的图像.输入格式 ...
iPhone照片格式heic图片怎么打开
苹果自iOS11系统之后默认的是heic图片格式,在电脑和安卓中都无法直接查看,需要将其转换图片格式,那苹果heic图片怎么转换成jpg格式?下面我们一起来看看吧! 使用工具:电脑.图片操作方法: ...
感知哈希算法——Python实现【转】
转自:https://blog.csdn.net/m_buddy/article/details/78887248 版权声明:本文为博主原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原 ...
Iconfinder 如何杜绝盗版，哈希算法检测图像重复
原地址:http://blog.jobbole.com/65914/ 本文由伯乐在线 - 小鱼翻译自 Silviu Tantos.欢迎加入技术翻译小组.转载请参见文章末尾处的要求. [伯乐在线导读 ...
Atitit.java图片图像处理attilax总结 BufferedImage extends java.awt.Image获取图像像素点image.getRGB(i, lineIndex); 图片剪辑/AtiPlatf_cms/src/com/attilax/img/imgx.javacutImage图片处理titit 判断判断一张图片是否包含另一张小图片 atitit 图片去噪算法的原理与
Atitit.java图片图像处理attilax总结 BufferedImage extends java.awt.Image 获取图像像素点 image.getRGB(i, lineIndex); ...
Notes:一致性哈希算法
业务场景: 存在三个专门提供缓存服务的服务器,前端所需要的图片等静态资源被缓存于这三个服务器其中之一. 但是如何提高查找图片的速度呢? 可以采用哈希算法. 常规意义上的哈希算法: 通过hash(图片名 ...
os常用模块，json，pickle，shelve模块，正则表达式（实现运算符分离），logging模块，配置模块，路径叠加，哈希算法
一.os常用模块显示当前工作目录 print(os.getcwd()) 返回上一层目录 os.chdir("..") 创建文件包 os.makedirs('python2/bin ...
转白话解析：一致性哈希算法 consistent hashing
摘要: 本文首先以一个经典的分布式缓存的应用场景为铺垫,在了解了这个应用场景之后,生动而又不失风趣地介绍了一致性哈希算法,同时也明确给出了一致性哈希算法的优点.存在的问题及其解决办法. 声明与致谢: ...
Python多线程问题的资料查找与汇总
Python多线程问题的资料查找与汇总声明: 1)本报告由博客园bitpeach撰写,版权所有,免费转载,请注明出处,并请勿作商业用途. 2)若本文档内有侵权文字或图片等内容,请联系作者bitpea ...
ELFhash - 优秀的字符串哈希算法
ELFhash - 优秀的字符串哈希算法 2016年10月29日 22:12:37 阅读数:6440更多个人分类: 算法杂论算法精讲数据结构所属专栏: 算法与数据结构版权声明:本文为博主原创 ...

随机推荐

ajax的async异步执行属性
遇到了一个ajax,看到了一个属性,async,是用来设置同步执行,或者是异步执行的举一个例子: $.ajax({ async: false, type : "post", ...
linux系统下载redis时make报错：没有名为什么》》》》》
明明自己下载了gcc-c++环境,但是make还是一直报错,没有名为什么的>>>>> 其实这个问题主要的原因的是gcc的版本过低了,你可以gcc -v查看一下你的版本,是 ...
springboot项目导出excel实现
参见:https://blog.csdn.net/duli_0105/article/details/102809936
Unity_飞机大战记录总结
记录步骤:win+R→PSR.exe 一.竖屏设置分辨率设为9:16 二.主控脚本添加一个空节点,命名"游戏主控" 新建游戏的主控脚本,命名为MyGame.cs,方便管理(即, ...
CanvasScaler的三种适配模式——缩放模式（Scale with Screen Size）
一.含义根据屏幕尺寸进行缩放,随着屏幕尺寸进行放大缩小二.参数介绍第一个参数一般是美术人员根据游戏主要面向的手机市场,比如安卓市场,用市场上最常用的分辨率作为制作UI图片的标准.这里填的数就是美 ...
根据已知经纬度导出对应地点的NDVI值
首先把经纬度和NDVI投影到同一个坐标上,其次把excel的经纬度到出城shp格式然后执行如下步骤: [工具][spatial analyst 工具][提取分析][多值提取至点]
如何在 Net6.0 中对 WebAPI 进行 JWT 认证和授权
一.简介我们做微服务开发,或者说做分布式开发,有一项技术我们是避不开的,那就是WebAPI,在 Net6.0中,有两类 WebAPI,一类是极简 WebAPI,它砍掉了很多冗余的东西,更纯粹的是做 ...
WinUI（WASDK）使用ChatGPT和摄像头手势识别结合TTS让机器人更智能
前言之前写过一篇基于ML.NET的手部关键点分类的博客,可以根据图片进行手部的提取分类,于是我就将手势分类和摄像头数据结合,集成到了我开发的电子脑壳软件里. 电子脑壳是一个为稚晖君开源的桌面机器人E ...
使用shell 方式对 vcenter 进行补丁升级
使用shell 方式对 vcenter 进行补丁升级背景:最近VMware官网发布了最新的VMware vCenter Server 7.0 iso补丁文件,为了安全起故此对vCenter 进行安全 ...
3.错误代码C4996
3.错误代码C4996 错误 C4996 'strcpy': This function or variable may be unsafe. Consider using strcpy_s inst ...

Python用哈希算法查找相似图片（包括不同分辨率，不同大小，不同格式的图片）

Python用哈希算法查找相似图片（包括不同分辨率，不同大小，不同格式的图片）的更多相关文章

随机推荐

热门专题