文件压缩与解压思想:

1)统计字符出现次数,结合最小堆的性质生成哈夫曼树;

2)对叶结点进行编码:从根结点出发,每向左孩子走一步记 0,每向右孩子走一步记 1,走到叶结点时得到的 0/1 序列即该字符的哈夫曼编码;

3)读取文件,用各字符的哈夫曼编码代替原字符,并把编码按位拼接成新的字节写入压缩文件,即压缩;

4)读取压缩文件,按位解读哈夫曼编码:从根结点出发沿哈夫曼树向下走,每到达一个叶结点就还原出一个原字符,即解压;

例如,对以下数据生成哈夫曼树,以及产生相应的哈夫曼编码:

[图片:哈夫曼树]

[图片:文件压缩与解压_02]

//自己写的最小堆(因为在加强巩固,所以没用STL里的)
#pragma once
#include<iostream>
#include<vector>
using namespace std;
// Ordering functor for pointer-like handles: compares the `_weight` member
// reached through `operator->` with `<`.
//
// T must support `->_weight`, and the weight type must support `operator<`.
// The call operator is const so the functor also works through const objects
// and const references.
template<class T>
struct Less
{
	// Returns true when left's weight is strictly smaller than right's weight.
	bool operator()(const T& left, const T& right) const
	{
		return left->_weight < right->_weight;
	}
};

// Binary heap stored in a vector.
//
// Compare()(a, b) == true means "a must be closer to the top than b"; with the
// default Less<T> this is a min-heap ordered by `_weight`.
// Index layout: the children of index i are 2*i+1 and 2*i+2, the parent of
// index i is (i-1)/2.
template<class T,class Compare=Less<T>>
class Heap
{
public:
	// Creates an empty heap.
	Heap()
	{}

	// Builds a heap from the `size` elements of `v`, skipping every element
	// that compares equal to `valid` (elements are kept when v[i] != valid).
	// NOTE(review): `valid` is declared as int rather than T, so this
	// constructor only compiles for element types comparable with int —
	// confirm the intended type before relying on it.
	Heap(const T*v, int size, int valid)
	{
		for (int i = 0; i < size; i++)
		{
			if (v[i] != valid)
				_v.push_back(v[i]);
		}
		// Sift down every internal node, starting from the last parent.
		// For 0 or 1 elements the start index is -1 or 0, which is harmless.
		const int count = static_cast<int>(_v.size());
		for (int root = (count - 2) / 2; root >= 0; root--)
		{
			AdjustDown(root);
		}
	}

	// Adds `value` and restores the heap property by sifting it up.
	void Insert(const T& value)
	{
		_v.push_back(value);
		AdjustUp(static_cast<int>(_v.size()) - 1);
	}

	// Removes the top element. Does nothing when the heap is empty
	// (previously this indexed _v[size-1] on an empty vector).
	void Remove()
	{
		if (_v.empty())
			return;
		std::swap(_v.front(), _v.back());
		_v.pop_back();
		AdjustDown(0);
	}

	// Returns a reference to the top element.
	// Throws std::out_of_range (via vector::at) when the heap is empty;
	// previously control fell off the end of the function without a return.
	T& GetTop()
	{
		return _v.at(0);
	}

	// True when the heap holds no elements.
	bool Empty() const
	{
		return _v.empty();
	}

	// Number of elements currently stored.
	int Size() const
	{
		return static_cast<int>(_v.size());
	}
protected:
	// Sifts the element at index `root` down until neither child has to be
	// above it. When both children qualify, the left child wins unless the
	// right one compares strictly before it.
	void AdjustDown(int root)
	{
		Compare before;
		const size_t count = _v.size();
		size_t parent = static_cast<size_t>(root);
		for (;;)
		{
			size_t child = parent * 2 + 1;
			if (child >= count)
				break;
			if (child + 1 < count && before(_v[child + 1], _v[child]))
				++child;
			if (!before(_v[child], _v[parent]))
				break;
			std::swap(_v[child], _v[parent]);
			parent = child;
		}
	}

	// Sifts the element at index `leaf` up while it compares before its parent.
	void AdjustUp(int leaf)
	{
		Compare before;
		while (leaf > 0)
		{
			const int parent = (leaf - 1) / 2;
			if (!before(_v[leaf], _v[parent]))
				break;
			std::swap(_v[leaf], _v[parent]);
			leaf = parent;
		}
	}
private:
	std::vector<T> _v;	// heap storage; index 0 is the top
};



//哈夫曼树的建立
#pragma once
#include<iostream>
#include<string>
#include"Heap.h"
using namespace std;
// One node of a Huffman tree: a weight plus links to the two children.
// A freshly constructed node has no children.
template<class T>
struct HuffmanTreeNode
{
	HuffmanTreeNode<T>* _left = nullptr;	// left child, null until linked
	HuffmanTreeNode<T>* _right = nullptr;	// right child, null until linked
	T _weight;				// payload / ordering key of this node

	// Creates a childless node holding a copy of `weight`.
	HuffmanTreeNode(const T& weight)
		: _weight(weight)
	{
	}
};
template<class T>
class HuffmanTree
{
	typedef HuffmanTreeNode<T> Node;
public:
	HuffmanTree(const T*a, int size, T invaild)
	{
		_CreatHuffmanTree(a, size, invaild);
	}
	Node* GetRoot()
	{
		return _root;
	}
protected:
	void _CreatHuffmanTree(const T*a,int size,T invaild)
	{
		Heap<Node*,Less<Node*>>hp;
		for (int i = 0; i < size; i++)
		{
			if (a[i] != invaild)
				hp.Insert(new Node(a[i]));//建立小堆
		}
		//当_v中只剩下一个数据时,哈弗曼树建立完成
		while (hp.Size()>1)
		{
			Node* left = hp.GetTop();
			hp.Remove();
			Node* right = hp.GetTop();
			hp.Remove();
			Node*parent = new Node(left->_weight + right->_weight);
			parent->_left = left;
			parent->_right = right;
			hp.Insert(parent);
		}
		_root = hp.GetTop();
	}
private:
	Node* _root;
};



//文件压缩与解压
#pragma once
#include<cstdio>
#include<iostream>
#include<string>
using namespace std;
#include"HuffmanTree.h"
// Per-byte statistics used as the weight type of the Huffman tree.
// All comparisons and the addition look only at `_count`.
struct CharInfo
{
	unsigned char _ch;	// the byte value this entry describes
	long _count;		// number of occurrences of `_ch`
	std::string _code;	// Huffman code as a string of '0'/'1' characters

	// Creates an entry with the given occurrence count. Also serves as the
	// default constructor and as the implicit conversion from a summed count.
	// `_ch` is zero-initialized (it used to be left indeterminate).
	CharInfo(const long count = 0)
		:_ch(0)
		, _count(count)
	{}

	// Creates an entry for byte `ch` with a zero count
	// (`_count` used to be left indeterminate).
	CharInfo(const unsigned char ch)
		:_ch(ch)
		, _count(0)
	{}

	// Sum of the two occurrence counts.
	long operator+(const CharInfo& info)const
	{
		return _count + info._count;
	}

	// True when the occurrence counts differ.
	bool operator!=(const CharInfo& info)const
	{
		return _count != info._count;
	}

	// Orders entries by occurrence count.
	bool operator<(const CharInfo& info)const
	{
		return _count < info._count;
	}
};
class FileCompress
{
public:
	//对文件filename进行压缩
	void Compress(const char* filename)
	{
		FILE* fread = fopen(filename, "rb");
		if (fread == NULL)
		{
			cout << "打开文件失败..." << endl;
			return;
		}
		//打开文件成功
		//1、字符与下标对应,填充结构体的_ch
		for (int i = 0; i < 256; i++)
		{
			_Info[i]._ch = i;
		}

		//2、统计各个字符出现的次数
		unsigned char ch = fgetc(fread);
		while (!feof(fread))
		{
			_Info[ch]._count++;
			ch = fgetc(fread);
		}

		//3、建立哈夫曼树
		HuffmanTree<CharInfo> hft(_Info, 256, CharInfo());

		//4、获取各个字符的哈夫曼编码
		//每次都从根节点开始,走到叶节点,即得到叶节点对应字符的编码
		HuffmanTreeNode<CharInfo>*root = hft.GetRoot();
		string code;
		GetHuffmanCodeOfChar(root, code);

	


		//5、从头开始读文件,把读取字符的编码重新组合写入压缩文件中
		fseek(fread, 0, SEEK_SET);
		//压缩产生的文件的名字
		string fcp = filename;
		fcp += ".compress";
		FILE* fwrite = fopen(fcp.c_str(), "wb");
		if (fwrite == NULL)
		{
			cout << "compress 文件打开失败" << endl;
			return;
		}

		//打开文件成功
		unsigned char data = 0;//写入压缩文件的字符
		int offset = 7;//偏移量
		ch = fgetc(fread);
		while (!feof(fread))
		{
			const char* str = _Info[ch]._code.c_str();//获取当前字符的编码
			while (*str != '\0')
			{
				if (offset >= 0)
				{
					data = data | ((*str - '0') << offset);
					offset--;
				}
				if (offset < 0)
				{
					fputc(data, fwrite);
					data = 0;
					offset = 7;
				}
				str++;
			}
			ch = fgetc(fread);
		}
		fputc(data, fwrite);
		WriteConfig(filename);
		fclose(fread);
		fclose(fwrite);
	}
	void UnCompress(const char* filename)
	{
		string fcp = filename;
		fcp += ".compress";
		FILE* fread = fopen(fcp.c_str(), "rb");
		if (fread == NULL)
		{
			cout << "打开文件失败..." << endl;
			return;
		}
		//打开文件成功
		//3、建立哈夫曼树
		CharInfo info[256];
		ReadConfig(filename, info);
		HuffmanTree<CharInfo> hft(info, 256, CharInfo());

		string fucp = filename;
		fucp += ".uncompress";
		FILE* fwrite = fopen(fucp.c_str(), "wb");

		HuffmanTreeNode<CharInfo>*root = hft.GetRoot();
		HuffmanTreeNode<CharInfo>*cur = root;//每次都要从根开始读,读到叶节点即可获取一个原字符
		long DataTotal = (root->_weight)._count;//原文件中的字符总数
		unsigned char ch = fgetc(fread);
		while (DataTotal)
		{
			int tmp = 1;
			int offset = 7;
			while (offset >= 0)
			{
				if (ch&(1 << offset))//检验位是否为0
				{
					cur = cur->_right;
					offset--;
				}
				else
				{
					cur = cur->_left;
					offset--;
				}
				if (cur->_left == NULL&&cur->_right == NULL)
				{
					unsigned char wch = cur->_weight._ch;
					fputc(wch, fwrite);
					cur = root;
					DataTotal--;
					//最后一个字符的编码在最后两个字节当中的情况
					if (!DataTotal)  
					{
						break;
					}
				}
			}
			ch = fgetc(fread);
		}
		fclose(fread);
		fclose(fwrite);
	}
protected:
	void WriteConfig(const char*filename)
	{
		string Config = filename;
		Config += "config";
		FILE* fwrite = fopen(Config.c_str(), "wb");
		if (fwrite == NULL)
		{
			cout << "打开文件失败" << endl;
			return;
		}
		//打开成功
		for (int i = 0; i < 256; i++)
		{
			if (_Info[i] ._count)
			{
				fputc(_Info[i]._ch, fwrite);
				fputc(',', fwrite);
				char count[100];
				//参数说明:1.要转换的值;2.结果存放区;3.进制
				_itoa(_Info[i]._count, count, 10);
				fputs(count, fwrite);
				fputc(',', fwrite);
				fputs(_Info[i]._code.c_str(), fwrite);
				fputc('\n', fwrite);
			}
		}
		fclose(fwrite);
	}
	void ReadConfig(const char*filename,CharInfo *info)
	{
		string Config = filename;
		Config += "config";
		FILE* fread = fopen(Config.c_str(), "rb");
		if (fread == NULL)
		{
			cout << "打开文件失败" << endl;
			return;
		}
		//打开成功
		char str[100];
		while (fgets(str, 100, fread))
		{
			char*pstr = str;
			unsigned char ch = (unsigned char)*pstr;//得到字符
			info[ch]._ch = ch;
			if (ch == '\n')
			{
				fgets(str, 100, fread);
				pstr=str;
				pstr++;
			}
			else
				pstr+=2;//跳过分隔符(逗号)
			//得到_count的字符串形式
			long count = 0;
			while (*pstr&&*pstr!=',')
			{
				count *= 10;
				count += *pstr - '0';
				pstr++;
			}
			info[ch]._count = count;//得到出现次数
			pstr++;
			string code(pstr);
			info[ch]._code = code;
		}
	}
	void GetHuffmanCodeOfChar(HuffmanTreeNode<CharInfo>*root, string& code)
	{
		if (root == NULL)
			return;
		if (root->_left == NULL&&root->_right == NULL)
		{
			unsigned char ch = root->_weight._ch;
			_Info[ch]._code = code;
			return;
		}
		GetHuffmanCodeOfChar(root->_left, code + '0');//左边编码为0
		GetHuffmanCodeOfChar(root->_right, code + '1');//右边编码为1
	}

private:
	CharInfo _Info[256];//结构体数组
};