LZ77.py

import math

from bitarray import bitarray

class LZ77Compressor:
    """A simplified implementation of the LZ77 compression algorithm.

    The compressed stream is a sequence of bit-packed tokens:

    * literal: a 0 flag bit followed by 8 bits holding one raw byte;
    * pointer: a 1 flag bit followed by a 12-bit distance (how far back
      from the current position the match starts) and a 4-bit match
      length.

    The stream is zero-padded at the end to a whole number of bytes.
    The code runs unchanged on Python 2 and Python 3.
    """

    MAX_WINDOW_SIZE = 400

    def __init__(self, window_size=20):
        """Create a compressor.

        window_size -- number of already-seen bytes searched for matches;
            capped at MAX_WINDOW_SIZE.
        """
        self.window_size = min(window_size, self.MAX_WINDOW_SIZE)
        # The match length is stored in 4 bits, so it can be at most 15.
        self.lookahead_buffer_size = 15

    def compress(self, input_file_path, output_file_path=None, verbose=False):
        """Compress the file at input_file_path.

        input_file_path  -- path of the file to read (opened in binary mode).
        output_file_path -- if given, the compressed bytes are written there
            and None is returned; otherwise the compressed data is returned
            as a bitarray.
        verbose          -- if true, every emitted token is printed to
            standard output.

        IOError raised while reading or writing is reported on standard
        output and re-raised.
        """
        import sys  # local import: only needed for the verbose trace

        # read the input file
        try:
            with open(input_file_path, 'rb') as input_file:
                # bytearray indexing yields ints on Python 2 and 3 alike
                data = bytearray(input_file.read())
        except IOError:
            print('Could not open input file ...')
            raise

        output_buffer = bitarray(endian='big')
        i = 0
        while i < len(data):
            match = self.findLongestMatch(data, i)
            if match:
                # 1 flag bit, then 12 bits of distance and 4 bits of length
                # packed into two bytes.
                distance, length = match
                output_buffer.append(True)
                output_buffer.frombytes(bytes(bytearray((
                    distance >> 4,
                    ((distance & 0xf) << 4) | length,
                ))))
                if verbose:
                    sys.stdout.write("<1, %i, %i> " % (distance, length))
                i += length
            else:
                # No useful match: 0 flag bit followed by the raw byte.
                output_buffer.append(False)
                output_buffer.frombytes(bytes(data[i:i + 1]))
                if verbose:
                    sys.stdout.write("<0, %s> " % chr(data[i]))
                i += 1
        if verbose:
            sys.stdout.write("\n")

        # pad with zero bits up to a multiple of 8
        output_buffer.fill()

        # write the compressed data into a binary file if a path is provided
        if output_file_path:
            try:
                with open(output_file_path, 'wb') as output_file:
                    output_file.write(output_buffer.tobytes())
            except IOError:
                print('Could not write to output file path. Please check if the path is correct ...')
                raise
            print("File was compressed successfully and saved to output path ...")
            return None

        # an output file path was not provided, return the compressed data
        return output_buffer

    def decompress(self, input_file_path, output_file_path=None):
        """Decompress the file at input_file_path.

        input_file_path  -- path of a file produced by compress().
        output_file_path -- if given, the decompressed bytes are written
            there and None is returned; otherwise they are returned as a
            byte string.

        IOError raised while reading or writing is reported on standard
        output and re-raised.  ValueError is raised when the stream is
        truncated or contains a back-reference pointing outside the data
        decoded so far.
        """
        data = bitarray(endian='big')

        # read the input file
        try:
            with open(input_file_path, 'rb') as input_file:
                data.fromfile(input_file)
        except IOError:
            print('Could not open input file ...')
            raise

        output_buffer = bytearray()
        total_bits = len(data)
        pos = 0  # read cursor; avoids O(n) deletions from the front

        # The shortest token is 9 bits; anything shorter left over is the
        # zero padding added by compress().
        while total_bits - pos >= 9:
            flag = data[pos]
            pos += 1
            if not flag:
                output_buffer.extend(bytearray(data[pos:pos + 8].tobytes()))
                pos += 8
            else:
                if total_bits - pos < 16:
                    raise ValueError('Compressed data is truncated ...')
                byte1, byte2 = bytearray(data[pos:pos + 16].tobytes())
                pos += 16
                distance = (byte1 << 4) | (byte2 >> 4)
                length = byte2 & 0xf
                if length and not 0 < distance <= len(output_buffer):
                    raise ValueError('Compressed data contains an invalid back-reference ...')
                # Copy byte by byte: the match may overlap the bytes being
                # produced (distance < length).
                for _ in range(length):
                    output_buffer.append(output_buffer[-distance])

        out_data = bytes(output_buffer)

        if output_file_path:
            try:
                with open(output_file_path, 'wb') as output_file:
                    output_file.write(out_data)
            except IOError:
                print('Could not write to output file path. Please check if the path is correct ...')
                raise
            print('File was decompressed successfully and saved to output path ...')
            return None

        return out_data

    def findLongestMatch(self, data, current_position):
        """Find the longest match for the data starting at current_position.

        data             -- indexable sequence (bytes, bytearray or str).
        current_position -- index at which the lookahead buffer starts.

        Candidate matches start inside the history window (the window_size
        items before current_position) and may run past current_position,
        i.e. overlap the lookahead buffer.  Returns a tuple
        (distance, length), or None when no match of at least 2 items
        exists.  Among equally long matches the one farthest back wins.
        """
        max_length = min(self.lookahead_buffer_size,
                         len(data) - current_position)
        window_start = max(0, current_position - self.window_size)

        best_match_distance = 0
        best_match_length = 0
        for start in range(window_start, current_position):
            length = 0
            # start + length always stays below current_position + length,
            # which is bounded by len(data) through max_length.
            while (length < max_length
                   and data[start + length] == data[current_position + length]):
                length += 1
            # Strict comparison keeps the earliest start among ties.
            if length > best_match_length:
                best_match_distance = current_position - start
                best_match_length = length

        # A pointer token costs 17 bits while a literal costs 9, so matches
        # of a single item are not worth encoding.
        if best_match_length < 2:
            return None
        return (best_match_distance, best_match_length)

LZ77.py的更多相关文章

python调用py中rar的路径问题。
1.python调用py,在py中的os.getcwd()获取的不是py的路径,可以通过os.path.split(os.path.realpath(__file__))[0]来获取py的路径. 2. ...
Python导入其他文件中的.py文件即模块
import sys sys.path.append("路径") import .py文件
LZ77压缩算法编码原理详解(结合图片和简单代码)
前言 LZ77算法是无损压缩算法,由以色列人Abraham Lempel发表于1977年.LZ77是典型的基于字典的压缩算法,现在很多压缩技术都是基于LZ77.鉴于其在数据压缩领域的地位,本文将结合图 ...
import renumber.py in pymol
cp renumber.py /usr/local/lib/python2.7/dist-packages/pymol import renumber or run /path/to/renumber ...
python gettitle.py
#!/usr/bin/env python # coding=utf-8 import threading import requests import Queue import sys import ...
解决 odoo.py: error: option --addons-path: The addons-path 'local-addons/' does not seem to a be a valid Addons Directory!
情况说明 odoo源文件路径-/odoo-dev/odoo/: 我的模块插件路径 ~/odoo-dev/local-addons/my-module 在my-module中创建了__init__.py ...
caffe机器学习自带图片分类器classify.py实现输出预测结果的概率及caffe的web_demo例子运行实例
caffe机器学习环境搭建及python接口编译参见我的上一篇博客:机器学习caffe环境搭建--redhat7.1和caffe的python接口编译 1.运行caffe图片分类器python接口还 ...
【转】Windows下使用libsvm中的grid.py和easy.py进行参数调优
libsvm中有进行参数调优的工具grid.py和easy.py可以使用,这些工具可以帮助我们选择更好的参数,减少自己参数选优带来的烦扰. 所需工具:libsvm.gnuplot 本机环境:Windo ...
MySqlNDB使用自带的ndb_setup.py安装集群
在用Mysql做集群时,使用Mysql的NDB版本更易于集群的扩展,稳定和数据的实时性. 我们可以使用Mysql自带的工具进行集群安装与管理:ndb_setup.py.位于Mysql的安装目录bin下 ...

随机推荐

Javascript 中 with 的替代方案和String 中的正则方法
这几天在升级自己的MVVM 框架,遇到很多小问题,就在这里统一解决了. with 语法在代码中,要执行这么一个函数 function computeExpression(exp, scope) { ...
Oracle --> Vertica 数据类型转换规则
需求:在Vertica数据库上建表,表结构来源于原Oracle数据库,故需要转换成Vertica数据库库表结构. 实际转换操作需要评估源库用到的所有数据类型和数据本身特性. 下面是总结的某场景下的 ...
PHP用单例模式实现一个数据库类
使用单例模式的出发点: 1.php的应用主要在于数据库应用, 所以一个应用中会存在大量的数据库操作, 使用单例模式, 则可以避免大量的new 操作消耗的资源. 2.如果系统中需要有一个类来全局控制某些 ...
python 添加tab补全
在平时查看Python方法用到tab补全还是很方便的. 1. mac 平台配置如下: mac是类Unix平台,需要在添加一条配置内容到bash_profile 中(默认是没有这个文件,可以新建一个放 ...
谈谈对Spring IOC的理解（转）
学习过Spring框架的人一定都会听过Spring的IoC(控制反转) .DI(依赖注入)这两个概念,对于初学Spring的人来说,总觉得IoC .DI这两个概念是模糊不清的,是很难理解的,今天和大家 ...
流程控制和循环.png
Reactjs-JQuery-Vuejs-Extjs-Angularjs对比
写在前面前端越来越混乱了,当然也可以美其名曰:繁荣. 当新启动一个前端项目,第一件事就是纠结:使用什么框架,重造什么轮子? 那么,希望看完此篇,能够给你一个清晰的认识,或者让你更加地纠结和无所适从 ...
Android 使用pull,sax解析xml
pull解析xml文件 1.获得XmlpullParser类的引用这里有两种方法 //解析器工厂 XmlPullParserFactory factory=XmlPullParserFactory. ...
UITextField
UITextFieldDemo 效果特点 1.有效定制键盘的样式 2.处理键盘对文本框的遮挡用法 1.导入文件(UITextField+CreateInputAccessoryView.h/.m) ...
iOS 正确选择图片加载方式
正确选择图片加载方式能够对内存优化起到很大的作用,常见的图片加载方式有下面三种: //方法1 UIImage *imag1 = [UIImage imageNamed:@"image.png ...

LZ77.py

LZ77.py的更多相关文章

随机推荐

热门专题