LZ77.py

import math

from bitarray import bitarray

class LZ77Compressor:
	"""
	A simplified implementation of the LZ77 compression algorithm.

	The compressed stream is a sequence of bit-packed tokens:

	* literal: a 0 flag bit followed by the 8 bits of one input byte;
	* match:   a 1 flag bit followed by a 12-bit distance (how far back from
	  the current position the match starts) and a 4-bit match length.

	The stream is zero-padded at the end to a whole number of bytes.
	The code runs unchanged on Python 2 and Python 3.
	"""

	# Upper bound applied to the window_size constructor argument.
	MAX_WINDOW_SIZE = 400

	def __init__(self, window_size=20):
		"""
		Create a compressor.

		window_size -- how many bytes before the current position are searched
			for matches; values above MAX_WINDOW_SIZE are clamped to it
		"""
		self.window_size = min(window_size, self.MAX_WINDOW_SIZE)
		self.lookahead_buffer_size = 15 # length of match is at most 4 bits

	def compress(self, input_file_path, output_file_path=None, verbose=False):
		"""
		Compress the content of the file at input_file_path.

		If output_file_path is given, the compressed bytes are written to that
		file and None is returned. Otherwise the compressed data is returned as
		a bitarray (already zero-padded to a multiple of 8 bits).

		If verbose is true, every emitted token is written to standard output
		as "<1, distance, length>" or "<0, character>", separated by spaces.

		IOError raised while reading or writing is reported on standard output
		and then re-raised.
		"""
		import sys  # used only for the verbose token trace

		output_buffer = bitarray(endian='big')

		# read the input file
		try:
			with open(input_file_path, 'rb') as input_file:
				data = input_file.read()
		except IOError:
			print('Could not open input file ...')
			raise

		i = 0
		while i < len(data):
			match = self.findLongestMatch(data, i)
			if match:
				# 1 bit flag, then the 12 bit distance and 4 bit length packed
				# into two bytes: dddddddd ddddllll
				(distance, length) = match
				output_buffer.append(True)
				packed = bytearray([distance >> 4, ((distance & 0xf) << 4) | length])
				output_buffer.frombytes(bytes(packed))
				if verbose:
					sys.stdout.write("<1, %i, %i> " % (distance, length))
				i += length
			else:
				# No useful match was found: 0 bit flag, then the raw byte.
				# Slicing (not indexing) yields a byte string on Python 2 and 3.
				literal = data[i:i + 1]
				output_buffer.append(False)
				output_buffer.frombytes(literal)
				if verbose:
					sys.stdout.write("<0, %s> " % chr(bytearray(literal)[0]))
				i += 1

		# fill the buffer with zeros if the number of bits is not a multiple of 8
		output_buffer.fill()

		# write the compressed data into a binary file if a path is provided
		if output_file_path:
			try:
				with open(output_file_path, 'wb') as output_file:
					output_file.write(output_buffer.tobytes())
					print("File was compressed successfully and saved to output path ...")
					return None
			except IOError:
				print('Could not write to output file path. Please check if the path is correct ...')
				raise

		# an output file path was not provided, return the compressed data
		return output_buffer

	def decompress(self, input_file_path, output_file_path=None):
		"""
		Decompress the file at input_file_path (as produced by compress).

		If output_file_path is given, the decompressed bytes are written to
		that file and None is returned. Otherwise the decompressed data is
		returned as a byte string.

		Raises ValueError if a match token is truncated or points outside the
		data decoded so far. IOError raised while reading or writing is
		reported on standard output and then re-raised.
		"""
		data = bitarray(endian='big')

		# read the input file
		try:
			with open(input_file_path, 'rb') as input_file:
				data.fromfile(input_file)
		except IOError:
			print('Could not open input file ...')
			raise

		output_buffer = bytearray()
		total_bits = len(data)
		# Walk the stream with a cursor rather than deleting consumed bits from
		# the front of the bitarray, which would cost O(n) per token.
		position = 0

		# The shortest token is 9 bits; anything shorter left over is padding.
		while total_bits - position >= 9:
			flag = data[position]
			position += 1
			if not flag:
				output_buffer.extend(bytearray(data[position:position + 8].tobytes()))
				position += 8
			else:
				if total_bits - position < 16:
					raise ValueError('Compressed data is truncated inside a match token')
				byte1, byte2 = bytearray(data[position:position + 16].tobytes())
				position += 16
				distance = (byte1 << 4) | (byte2 >> 4)
				length = byte2 & 0xf
				if distance == 0 or distance > len(output_buffer):
					raise ValueError('Compressed data contains an invalid match distance: %i' % distance)
				# Copy byte by byte: when length > distance the match overlaps
				# the bytes being produced, so each copy may read a byte that
				# was appended earlier in this same loop.
				for _ in range(length):
					output_buffer.append(output_buffer[-distance])

		out_data = bytes(output_buffer)

		if output_file_path:
			try:
				with open(output_file_path, 'wb') as output_file:
					output_file.write(out_data)
					print('File was decompressed successfully and saved to output path ...')
					return None
			except IOError:
				print('Could not write to output file path. Please check if the path is correct ...')
				raise

		return out_data

	def findLongestMatch(self, data, current_position):
		"""
		Find the longest match for the data starting at current_position
		(the lookahead buffer) within the preceding history window.

		A candidate starting at index i is treated as repeating with period
		(current_position - i), so a match may run past current_position.

		Returns (distance, length) for the longest match, where distance is
		current_position minus the match start. Among equally long matches the
		one starting earliest in the window (largest distance) is returned.
		Returns None when no match of at least 2 bytes exists: a 1-byte match
		would cost 17 bits against 9 bits for a literal.
		"""
		# Longest length allowed by the lookahead buffer and the remaining input.
		max_length = min(self.lookahead_buffer_size, len(data) - current_position)
		start_index = max(0, current_position - self.window_size)

		best_match_distance = -1
		# Starting at 1 means only matches of length 2 or more are accepted.
		best_match_length = 1

		for i in range(start_index, current_position):
			distance = current_position - i
			length = 0
			# Extend the match one byte at a time; the modulo wraps the source
			# index so the candidate repeats periodically.
			while (length < max_length and
					data[i + (length % distance)] == data[current_position + length]):
				length += 1
			# Strict comparison keeps the earliest start among equal lengths.
			if length > best_match_length:
				best_match_distance = distance
				best_match_length = length

		if best_match_distance > 0:
			return (best_match_distance, best_match_length)
		return None

LZ77.py的更多相关文章

python调用py中rar的路径问题。
1.python调用py,在py中的os.getcwd()获取的不是py的路径,可以通过os.path.split(os.path.realpath(__file__))[0]来获取py的路径. 2. ...
Python导入其他文件中的.py文件即模块
import sys sys.path.append("路径") import .py文件
LZ77压缩算法编码原理详解(结合图片和简单代码)
前言 LZ77算法是无损压缩算法,由以色列人Abraham Lempel发表于1977年.LZ77是典型的基于字典的压缩算法,现在很多压缩技术都是基于LZ77.鉴于其在数据压缩领域的地位,本文将结合图 ...
import renumber.py in pymol
cp renumber.py /usr/local/lib/python2.7/dist-packages/pymol import renumber or run /path/to/renumber ...
python gettitle.py
#!/usr/bin/env python # coding=utf-8 import threading import requests import Queue import sys import ...
解决 odoo.py: error: option --addons-path: The addons-path 'local-addons/' does not seem to a be a valid Addons Directory!
情况说明 odoo源文件路径-/odoo-dev/odoo/: 我的模块插件路径 ~/odoo-dev/local-addons/my-module 在my-module中创建了__init__.py ...
caffe机器学习自带图片分类器classify.py实现输出预测结果的概率及caffe的web_demo例子运行实例
caffe机器学习环境搭建及python接口编译参见我的上一篇博客:机器学习caffe环境搭建--redhat7.1和caffe的python接口编译 1.运行caffe图片分类器python接口还 ...
【转】Windows下使用libsvm中的grid.py和easy.py进行参数调优
libsvm中有进行参数调优的工具grid.py和easy.py可以使用,这些工具可以帮助我们选择更好的参数,减少自己参数选优带来的烦扰. 所需工具:libsvm.gnuplot 本机环境:Windo ...
MySqlNDB使用自带的ndb_setup.py安装集群
在用Mysql做集群时,使用Mysql的NDB版本更易于集群的扩展,稳定和数据的实时性. 我们可以使用Mysql自带的工具进行集群安装与管理:ndb_setup.py.位于Mysql的安装目录bin下 ...

随机推荐

Angular Service入门
1.Angular内置service Angular为了方便开发者开发,本身提供了非常多的内置服务.可以通过https://docs.angularjs.org/api/ng/service查看Ang ...
iOS开发之"省市"二级联动的数据组织(PHP版)以及PickerView的实现与封装
之所以要发表这篇博客,还源于最近的开发工作所实现的一个小的Demo, 当然这个Demo不会涉及工作中App的一些内容,下方要实现的Demo是通用的.因为项目需求的迭代,要求在银行卡绑定中添加支行所在的 ...
无限循环轮播图之结构布局（原生JS）
html部分 <div class="box" id="box"> <ul> <li><img src="i ...
Oracle 11g DG配置简明版
环境: 主库A机:在线生产环境,RHEL 6.4 + Oracle 11.2.0.3 备库B机:新增备机,RHEL 6.4 需求: 对生产环境最小影响前提下配置DG备库. 目录: 一.B机安装相同版本 ...
学会给你的类(及成员)来定制一套自己的Attribute吧
在通过Visual Studio创建的C#程序集中,都包含了一个AssemblyInfo.cs的文件,在这个文件中,我们常常会看到这样的代码 [assembly: AssemblyTitle(&quo ...
C#/ASP.NET完善的DBHelper，配套Model生成器
支持Oracle.MSSQL.MySQL.SQLite四种数据库,支持事务,支持对象关系映射:已在多个项目中实际使用. 没有语法糖,学习成本几乎为0,拿来即用. DBHelper类完整代码: usin ...
C#限速下载网络文件
代码: using System; using System.Collections.Concurrent; using System.Collections.Generic; using Syste ...
wpf 列表、菜单收起与展开，通过Grid DoubleAnimation或者Expander实现
菜单收缩有很多种方法具体如何实现还是看个人想法: 第一种通过后台控制收起与展开: 效果图: 代码 : <Grid> <Grid.ColumnDefinitions> <C ...
Mysql性能优化二
接上一篇Mysql性能优化一建立适当的索引说起提高数据库性能,索引是最物美价廉的东西了.不用加内存,不用改程序,不用调sql,只要执行个正确的'create index',查询速度就可能提高百倍千 ...
子div设置浮动无法把父div撑开。
<div class="mainBox"> <div class="leftBox"></div> <div clas ...

LZ77.py

LZ77.py的更多相关文章

随机推荐

热门专题