哈夫曼编码压缩原理:字符在内存中以ASCII码存储,每个字符固定占用8个二进制位。哈夫曼编码利用哈夫曼树为每个字符重新分配01编码,并根据字符在文章中出现的频率调整01串的长度:出现频率越高的字符在哈夫曼树中的权重越大,编码后的01串越短,从而使最终计算出的平均编码长度小于8。在本代码中平均编码长度约为4.72,压缩率约为59%(即4.72/8),从而达到压缩文本的目的。
// HuffmanEncode.cpp: 定义控制台应用程序的入口点。
//
#include "stdafx.h"
#include <stdio.h>
#include <string.h>
#include<string>
int length; // Number of characters read from the input text; reset and incremented by Char_Probability_Fromfile.
// Occurrence counter for one distinct character of the input text.
struct Char_Frequency
{
char c; // the character being counted; 0 marks an unused slot
int Frequency; // how many times `c` has been seen
};
Char_Frequency a[10000]; // Frequency table, one slot per distinct character; readers scan it until the first slot whose `c` is 0.
/*
 * Count how often each character occurs in a text stream.
 *
 * Fills the global table `a` (one slot per distinct character, in order of
 * first appearance, terminated by a slot whose `c` is '\0') and stores the
 * number of counted characters in the global `length`.
 *
 * fp: stream opened for reading. If it is NULL the table is cleared,
 *     `length` is set to 0 and nothing else happens.
 *
 * NUL bytes in the input are skipped: '\0' is the end-of-table marker, so
 * such a byte can never get a slot (and therefore never gets a code).
 * Leaving it out of `length` keeps `length` equal to the sum of all
 * frequencies in the table.
 */
void Char_Probability_Fromfile(FILE *fp)
{
    const int capacity = (int)(sizeof a / sizeof a[0]);
    char ch;

    length = 0;
    /* Clear the whole table, not just a prefix, so a second call cannot
       pick up stale entries left by an earlier one. */
    for (int i = 0; i < capacity; i++)
    {
        a[i].c = '\0';
        a[i].Frequency = 0;
    }
    if (fp == NULL)
    {
        return;
    }

    while (fscanf(fp, "%c", &ch) == 1)
    {
        int i = 0;

        if (ch == '\0')
        {
            continue;
        }
        length++;

        /* Stop at the matching slot or at the first unused one. */
        while (i < capacity && a[i].c != '\0' && a[i].c != ch)
        {
            i++;
        }
        if (i < capacity && a[i].c == ch)
        {
            a[i].Frequency++;
        }
        else if (i < capacity - 1)
        {
            /* The last slot is never filled so the table always keeps
               its '\0' terminator. */
            a[i].c = ch;
            a[i].Frequency = 1;
        }
    }
}
/* Storage layout of the Huffman tree: nodes live in one flat array and refer to each other by array index. */
typedef struct {
int weight; // leaf: frequency of `c`; merged node: sum of its two children's weights
char c; // character carried by a leaf node
int lchild; // index of the left child, -1 when there is none
int rchild; // index of the right child, -1 when there is none
int parent; // index of the parent, -1 while the node has not been merged
}HTNODE;
typedef HTNODE HuffmanT[10000];
HuffmanT T; // the single global tree used by every function in this file
/*
 * Reset every node of the global tree T to the "unused" state:
 * no character, weight 0, and -1 for the parent and both child links.
 *
 * The whole array is reset. Building a tree for n characters touches
 * 2*n nodes, so clearing only a fixed prefix would leave later nodes with
 * zero-initialised links (parent == 0), and those nodes would never be
 * recognised as parentless.
 */
void InitHT()
{
    const int capacity = (int)(sizeof T / sizeof T[0]);

    for (int i = 0; i < capacity; i++)
    {
        T[i].c = '\0';
        T[i].lchild = -1;
        T[i].rchild = -1;
        T[i].parent = -1;
        T[i].weight = 0;
    }
}
/*
 * Load the leaves of the Huffman tree from the frequency table.
 *
 * Copies the first n entries of `a` into T[0..n-1] (character and weight)
 * and then prints every loaded leaf.
 *
 * n: number of distinct characters in `a`.
 */
void InputW(int n)
{
    for (int i = 0; i < n; i++)
    {
        T[i].c = a[i].c;
        T[i].weight = a[i].Frequency;
    }
    for (int i = 0; i < n; i++)
    {
        /* weight is an int, so it must be printed with %d (not %ld). */
        printf("权重初始为:%c %d\n", T[i].c, T[i].weight);
    }
}
/*
 * Find the two lightest parentless nodes among T[0..n-1].
 *
 * n:  number of leading nodes of T to examine.
 * p1: receives the index of one of the two lightest parentless nodes.
 * p2: receives the index of the other one.
 *
 * The search first seeds *p1 and *p2 with the first two parentless nodes,
 * then lowers *p1 to the lightest node other than *p2, and finally lowers
 * *p2 to the lightest node other than *p1. Only strictly lighter nodes
 * replace the current choice, so on ties the earlier pick is kept.
 *
 * If fewer than two parentless nodes exist, the output that could not be
 * seeded keeps whatever value the caller stored in it beforehand.
 */
void SelectMin(int n, int *p1, int *p2)
{
    int first = 0;
    int second;
    int k;

    /* Seed *p1 with the first node that has no parent. */
    while (first < n && T[first].parent != -1)
    {
        ++first;
    }
    if (first < n)
    {
        *p1 = first;
    }

    /* Seed *p2 with the next parentless node after it. */
    second = first + 1;
    while (second < n && T[second].parent != -1)
    {
        ++second;
    }
    if (second < n)
    {
        *p2 = second;
    }

    /* Move *p1 to the lightest parentless node that is not *p2. */
    for (k = 0; k < n; ++k)
    {
        if (T[*p1].weight <= T[k].weight)
        {
            continue;
        }
        if (T[k].parent != -1)
        {
            continue;
        }
        if (k == *p2)
        {
            continue;
        }
        *p1 = k;
    }

    /* Move *p2 to the lightest parentless node that is not *p1. */
    for (k = 0; k < n; ++k)
    {
        if (T[*p2].weight <= T[k].weight)
        {
            continue;
        }
        if (T[k].parent != -1)
        {
            continue;
        }
        if (k == *p1)
        {
            continue;
        }
        *p2 = k;
    }
}
/*
 * Build the Huffman tree for the first n entries of the frequency table.
 *
 * n: number of distinct characters (leaves). Nothing is built for n <= 0
 *    or when 2*n nodes do not fit into T.
 *
 * Layout after the call (n >= 2):
 *   T[0 .. n-1]     leaves, loaded by InputW
 *   T[n .. 2n-2]    merged nodes; T[2n-2] is the real root (parent == -1)
 *   T[2n-1]         a copy of the root's child links and weight
 *
 * A tree with n leaves needs exactly n-1 merges. Running an n-th merge
 * would call SelectMin with a single parentless node left, re-use a stale
 * second index and make one subtree's codes one bit longer than optimal.
 * Decode in this file starts walking at index 2*n-1, so that slot is
 * filled with a copy of the root instead of an extra merge.
 *
 * For n == 1 the single leaf becomes the left child of T[1], which gives
 * the character the one-bit code "0".
 */
void CreateHT(int n)
{
    const int capacity = (int)(sizeof T / sizeof T[0]);
    int i;
    int p1 = 0;
    int p2 = 0;

    InitHT();
    if (n <= 0)
    {
        return;
    }
    if (n > capacity / 2)
    {
        fprintf(stderr, "CreateHT: %d symbols do not fit in a tree of %d nodes\n", n, capacity);
        return;
    }
    InputW(n);

    /* Make sure every node this build uses starts out unlinked, whatever
       range InitHT happened to cover. */
    for (i = 0; i < 2 * n; i++)
    {
        T[i].parent = -1;
        T[i].lchild = -1;
        T[i].rchild = -1;
    }

    if (n == 1)
    {
        T[0].parent = 1;
        T[1].c = '\0';
        T[1].lchild = 0;
        T[1].weight = T[0].weight;
        return;
    }

    /* n-1 merges; at every step at least two parentless nodes remain, so
       SelectMin always assigns both p1 and p2. */
    for (i = n; i < 2 * n - 1; i++)
    {
        SelectMin(i, &p1, &p2);
        T[p1].parent = T[p2].parent = i;
        T[i].c = '\0';
        T[i].lchild = p1;
        T[i].rchild = p2;
        T[i].weight = T[p1].weight + T[p2].weight;
    }

    /* Mirror the root at index 2n-1 for decoders that start there. No node
       names this slot as its parent, so code generation is unaffected. */
    T[2 * n - 1].c = '\0';
    T[2 * n - 1].lchild = T[2 * n - 2].lchild;
    T[2 * n - 1].rchild = T[2 * n - 2].rchild;
    T[2 * n - 1].weight = T[2 * n - 2].weight;
}
/* One row of the Huffman code table: a character and its code. */
typedef struct {
char ch; // the character this row encodes
char bits[1000]; // its code as a NUL-terminated string of '0' and '1' characters
}CodeNode;
typedef CodeNode HuffmanCode[100000];
HuffmanCode H; // global code table; row i is filled from tree leaf T[i] by CharSetHuffmanEncoding
/*
 * Derive the code of every leaf and store it in the code table H.
 *
 * n: number of leaves; H[0..n-1] are filled from T[0..n-1].
 *
 * For each leaf the tree is climbed through the parent links while the
 * parent index is greater than 0. Each step contributes one character,
 * '0' when the current node is its parent's left child and '1' otherwise.
 * The characters are written from the back of a scratch buffer towards
 * the front, so the finished string reads from the top of the tree down
 * to the leaf.
 */
void CharSetHuffmanEncoding(int n)
{
    char path[10000];

    path[n] = '\0';
    for (int leaf = 0; leaf < n; ++leaf)
    {
        int pos = n;            /* index of the first used character in path */
        int node = leaf;
        int up = T[node].parent;

        H[leaf].ch = T[leaf].c;
        while (up > 0)
        {
            --pos;
            if (T[up].lchild == node)
            {
                path[pos] = '0';
            }
            else
            {
                path[pos] = '1';
            }
            node = up;
            up = T[node].parent;
        }
        strcpy(H[leaf].bits, path + pos);
    }
}
char copy[100000000]; // In-memory mirror of the encoded '0'/'1' stream: Encode appends to it and Decode reads it back.
/*
 * Encode the input text with the code table H.
 *
 * Reads D:\test2.txt character by character, writes the code of each
 * character as text ('0'/'1') to D:\test2w.txt, and appends the same
 * characters to the global buffer `copy`.
 *
 * n: number of valid rows in H. Characters without a row are skipped.
 *
 * If either file cannot be opened an error is reported and nothing is
 * encoded. Both files are closed before returning.
 */
void Encode(int n)
{
    FILE *fp = fopen("D:\\test2.txt", "r");
    if (fp == NULL)
    {
        perror("D:\\test2.txt");
        return;
    }
    FILE *fp2 = fopen("D:\\test2w.txt", "w");
    if (fp2 == NULL)
    {
        perror("D:\\test2w.txt");
        fclose(fp);
        return;
    }

    /* Track the end of `copy` ourselves: calling strcat for every character
       would rescan the whole buffer each time (quadratic in the output). */
    size_t used = strlen(copy);
    int buffer_full = 0;
    char ch;

    while (fscanf(fp, "%c", &ch) == 1)
    {
        for (int i = 0; i < n; i++)
        {
            if (H[i].ch != ch)
            {
                continue;
            }
            fputs(H[i].bits, fp2);
            if (!buffer_full)
            {
                size_t len = strlen(H[i].bits);
                if (len < sizeof copy - used)
                {
                    memcpy(copy + used, H[i].bits, len + 1);
                    used += len;
                }
                else
                {
                    /* Keep writing the file, but never write past `copy`. */
                    buffer_full = 1;
                    fprintf(stderr, "Encode: in-memory bit buffer is full; Decode will only see a prefix\n");
                }
            }
            /* Rows are expected to be unique per character (the frequency
               table adds a character only when it is not present yet). */
            break;
        }
    }

    fclose(fp);
    if (fclose(fp2) != 0)
    {
        perror("D:\\test2w.txt");
    }
}
/*
 * Decode the bit string held in `copy` and write the text to D:\decode.txt.
 *
 * n: number of leaves of the tree; decoding starts at node 2*n-1.
 *
 * Walks the tree from the start node: '0' follows the left child, '1' the
 * right child, any other character leaves the position unchanged. When a
 * node without children is reached its character is written and the walk
 * restarts at the start node.
 *
 * The output file is created even when there is nothing to decode, and it
 * is closed before returning. Decoding stops early, with a message, if a
 * bit would follow a child link that does not exist.
 */
void Decode(int n)
{
    const int capacity = (int)(sizeof T / sizeof T[0]);
    FILE *fp = fopen("D:\\decode.txt", "w");
    if (fp == NULL)
    {
        perror("D:\\decode.txt");
        return;
    }

    const int root = 2 * n - 1;
    if (n > 0 && root < capacity)
    {
        /* Measure the input once instead of on every loop iteration. */
        const size_t total = strlen(copy);
        int p = root;

        for (size_t i = 0; i < total; i++)
        {
            if (copy[i] == '0')
            {
                p = T[p].lchild;
            }
            else if (copy[i] == '1')
            {
                p = T[p].rchild;
            }
            if (p < 0)
            {
                /* Never index T with -1. */
                fprintf(stderr, "Decode: bit %lu follows a missing child; stopping\n", (unsigned long)i);
                break;
            }
            if (T[p].lchild == -1 && T[p].rchild == -1)
            {
                fputc(T[p].c, fp);
                p = root;
            }
        }
    }

    if (fclose(fp) != 0)
    {
        perror("D:\\decode.txt");
    }
}
/*
 * Print the compression ratio: average code length per character divided
 * by 8 (the size of an unencoded character in bits).
 *
 * n: number of valid rows in H / entries in `a`.
 *
 * The total number of encoded bits is summed exactly as an integer and
 * divided once. Adding up n separately rounded float ratios would
 * accumulate rounding error. When no characters were counted the ratio is
 * reported as 0.
 */
void Encode_Rate(int n)
{
    long long total_bits = 0;
    double rate = 0.0;

    for (int i = 0; i < n; i++)
    {
        total_bits += (long long)strlen(H[i].bits) * a[i].Frequency;
    }
    if (length > 0)
    {
        rate = (double)total_bits / (8.0 * (double)length);
    }
    printf("压缩率为: %f\n", rate);
}
/*
 * Program entry point: count character frequencies in D:\test2.txt, build
 * the Huffman tree and code table, encode the file, decode it again and
 * print the compression ratio.
 *
 * Returns 0 on success and 1 when the input file cannot be opened.
 */
int main()
{
    const int table_capacity = (int)(sizeof a / sizeof a[0]);
    const int tree_capacity = (int)(sizeof T / sizeof T[0]);

    FILE *fp = fopen("D:\\test2.txt", "r");
    if (fp == NULL)
    {
        /* Without this check the NULL stream would be handed to fscanf. */
        perror("D:\\test2.txt");
        return 1;
    }
    Char_Probability_Fromfile(fp);
    fclose(fp);

    /* The frequency table ends at the first slot whose character is 0;
       i ends up as the number of distinct characters. */
    int i = 0;
    while (i < table_capacity && a[i].c != '\0')
    {
        printf("%c %d %d", a[i].c, a[i].c, a[i].Frequency);
        printf("\n");
        i++;
    }
    printf("i为:%d\n", i);

    CreateHT(i);
    /* Dump every tree node up to the first one with weight 0. */
    for (int node = 0; node < tree_capacity && T[node].weight != 0; node++)
    {
        printf("序号:%d %c 权重:%d 父母:%d 左儿子:%d 右儿子:%d \n", node, T[node].c, T[node].weight, T[node].parent, T[node].lchild, T[node].rchild);
    }
    printf("-------\n");

    CharSetHuffmanEncoding(i);
    Encode(i);
    Decode(i);
    printf("\n");
    Encode_Rate(i);
    return 0;
}
附上运行结果图片:
压缩译码前:
压缩译码后:
译码后,恢复原文: