1. 赫夫曼编码 - 最优二叉树
大幅减少字符串编码成二进制数的长度 – 比固定长度ASCII编码减少50%
字符编码:将字符转成二进制进行网络传输
字符编码方法
1、 纯ascii编码-每个字符都转成固定8位二进制--占用空间大;
2、 计算句子同样字符出现个数,自己定义不定长的二进制位数进行表示字符,一啊不般出现次数越多,位数越少-不固定编码,占用空间小,但编码会冲突;
3、 赫夫曼编码:利用赫夫曼树进行编码字符--不固定编码,大大减少空间,但编码不冲突;
赫夫曼编码流程
1、 计算目标字符串中各个字符出现的次数;
2、 将步骤1重构成最优二叉树-节点的每个权重代表字符出现的次数;
3、 字符编码:根节点到某个字符节点的路径(左为0、右为1);
4、 根据3步骤的字符编码,将目标字符串转成二进制编码;
OptimalBinaryTree.java
/**
* @Classname OptimalBinaryTree
* @Description
* @Date 2022/4/13 10:48
* @Created by lrc
*/
public class OptimalBinaryTree<T> {
//二叉树根节点 - 根据datas进行组建
Node<T> root;
//构造器传入的数据
T[] datas;
//传数组构造最优二叉树
OptimalBinaryTree(@NotNull T...datas) {
T[] copyData = Arrays.copyOf(datas, datas.length);
Arrays.sort(copyData);
// System.out.println(Arrays.toString(copyData));
datas = copyData;
createdOptimalBinaryTree(datas);
}
public Node<T> getRoot() {
return root;
}
// 最优二叉树 - 中序遍历
public String inOrder() {
StringBuilder sb = new StringBuilder();
sb.append("[");
//真正的中序遍历
root.inOrder(root,sb);
sb.replace(sb.lastIndexOf(", "), sb.length(), "");
sb.append("]");
return sb.toString();
}
// 最优二叉树 - 前序遍历
public String preOrder() {
StringBuilder sb = new StringBuilder();
sb.append("[");
//真正的前序遍历
root.preOrder(root,sb);
sb.replace(sb.lastIndexOf(", "), sb.length(), "");
sb.append("]");
return sb.toString();
}
@Override
public String toString() {
return preOrder();
}
private boolean createdOptimalBinaryTree(T[] datas) {
LinkedList<Node<T>> list = new LinkedList<>();
boolean flag = false;
Node<T> newNode;
for(T data : datas) {
newNode = new Node<>();
newNode.setData(data);
list.add( newNode );
}
while(true) {
//弹出两个子节点,构成左右子树
Node<T> lNode = list.pop();
Node<T> rNode = list.pop();
//父节点权重是上面两个子节点的权重
Node<T> parNode = new Node<>();
parNode.weight = lNode.weight + rNode.weight;
parNode.leftNode = lNode;
parNode.rightNode = rNode;
//父节点必须往链表头添加 - 否则遇到weigth相同时,组件二叉树会混乱
list.addFirst(parNode);
Collections.sort(list);
//打印当前链表情况
// System.out.println("遍历" + list);
//如果链表中只有一个元素,说明这个节点就是根节点
if(list.size() == 1) {
root = list.get(0);
flag = true;
break;
}
}
return flag;
}
protected class Node<T> implements Comparable<Node> {
T data;
Integer weight;
Node<T> leftNode;
Node<T> rightNode;
public T getData() {
return data;
}
public Node<T> getLeftNode() {
return leftNode;
}
public void setLeftNode(Node<T> leftNode) {
this.leftNode = leftNode;
}
public Node<T> getRightNode() {
return rightNode;
}
public void setRightNode(Node<T> rightNode) {
this.rightNode = rightNode;
}
public void preOrder(Node node, StringBuilder sb) {
sb.append(node.toString() + ", ");
if(node.leftNode != null) {
preOrder(node.leftNode, sb);
}
if(node.rightNode != null) {
preOrder(node.rightNode, sb);
}
}
public void inOrder(Node node, StringBuilder sb) {
if(node.leftNode != null) {
inOrder(node.leftNode, sb);
}
sb.append(node.toString() + ", ");
if(node.rightNode != null) {
inOrder(node.rightNode, sb);
}
}
public void setData(T data) {
this.data = data;
//节点的权重硬性规定为数据data内部的权重
try {
Field field = data.getClass().getDeclaredField("weight");
field.setAccessible(true);
weight = (Integer) field.get(data);
} catch (Exception e) {
e.printStackTrace();
}
}
@Override
public String toString() {
return "Node{" +
"data=" + data +
", weight=" + weight +
'}';
}
// node排序必须要实现这个函数 大于实参则大于0 小于则小于0 等于则等于0
@Override
public int compareTo(Node node) {
return this.weight - node.weight;
}
}
public static void main(String[] args) {
//1. 数据准备 - 为什么写那么麻烦,主要是写熟悉一些数组、链表之间的转换函数
Student student1 = new Student(1, "lrc", 8);
Student student2 = new Student(2, "qcj", 2);
Student student3 = new Student(3, "ert", 10);
Student student4 = new Student(4, "bfg", 5);
Student student5 = new Student(5, "ytu", 25);
Student[] stu = {
student1,student2,student3,student4,student5};
// 2 5 8 10 25
// 7 8 10 25
// 10 15 25
// 25 25
// 50
//2. 构造最优二叉树
OptimalBinaryTree<Student> tree = new OptimalBinaryTree(stu);
//3. 前序遍历二叉树
System.out.println(tree);
}
}
HuffmanCode.java
package top.linruchang.algorithm.tree;
import java.util.*;
/**
* @Classname HuffmanCode
* @Description
* @Date 2022/4/13 21:27
* @Created by lrc
*/
public class HuffmanCode {
// 根据data字符个数构建赫夫曼二叉树
OptimalBinaryTree<MyCharacter> tree;
//需要被赫夫曼编码的原字符串
String data;
//赫夫曼编码字符串data
String encodedData;
//压缩encodedData的数据
byte[] encodedDataArray;
//存储data字符串对应的每个字符个数记录
HashMap<Character, MyCharacter> charMap;
//根据tree获取对应字符的赫夫曼编码
HashMap<Character, String> codeMap = new HashMap<>();
HuffmanCode(String str) {
//需要被赫夫曼编码的原字符串
this.data = str;
// 1. 计算字符串各个字符出现的次数
initCharMap(str);
System.out.println("每个字符出现的次数:" + charMap);
// 2. 构造最优二叉树
MyCharacter[] myCharacters = new MyCharacter[charMap.size()];
charMap.values().toArray(myCharacters);
tree = new OptimalBinaryTree<>(myCharacters);
System.out.println("前序遍历赫夫曼树:" + tree);
//3. 根据tree获取对应字符的赫夫曼编码
initHuffmanCode(tree.getRoot(), "", new StringBuilder());
System.out.println("编码转换规则" + codeMap);
//4. 根据codeMap字符编码规则将data字符串转成二进制编码
EncodingData();
System.out.println("原字符串:" + data);
System.out.println("赫夫曼编码后的字符串:" + encodedData);
//5. 压缩encodedData
zipCode();
}
public HashMap<Character, String> getCodeMap() {
return codeMap;
}
/**
* 解码
* @param binaryMessg 字节数组
* @param codeMap 赫夫曼编码规则
* @return 返回原字符串
*/
public static String decode(byte[] binaryMessg, Map< Character, String> codeMap) {
String encodedDataByBytes = getEncodedDataByBytes(binaryMessg, codeMap);
return decode(encodedDataByBytes, codeMap);
}
/**
*
* @param binaryMessg 二进制字符串
* @param codeMap 赫夫曼编码规则
* @return 返回原字符串
*/
public static String decode(String binaryMessg, Map< Character, String> codeMap) {
StringBuilder textContent = new StringBuilder();
Set<Map.Entry<Character, String>> entries = codeMap.entrySet();
Integer startIndex = 0;
while(true) {
String code;
Integer tempEndIndex;
for(Map.Entry<Character, String> map: entries) {
code = map.getValue();
tempEndIndex = startIndex + code.length();
if(tempEndIndex > binaryMessg.length()) {
continue;
}
String substr = binaryMessg.substring(startIndex, tempEndIndex);
if(code.equals(substr)) {
textContent.append(map.getKey());
startIndex = tempEndIndex;
break;
}
}
if(startIndex >= binaryMessg.length()) {
break;
}
}
return textContent.toString();
}
/**
*
* @param bytes 字节数组 - 存储了赫夫曼编码的信息
* @param codeMap 匹配规则
* @return 返回赫夫曼编码的二进制信息 - 根据这段信息进行解码
*/
public static String getEncodedDataByBytes(byte[] bytes, Map< Character, String> codeMap) {
StringBuilder sb = new StringBuilder();
int j = 0;
for(byte b : bytes) {
String binStr = Integer.toBinaryString(b);
if(binStr.length() <= 8) {
if(j == bytes.length -1) {
sb = adjustString(sb, new StringBuilder(binStr), codeMap);
break;
}
for(int i = 0; i<8-binStr.length(); i++) {
sb.append("0");
}
sb.append(binStr);
}
if(binStr.length() > 8) {
sb.append(binStr.substring(binStr.length()-8));
}
j++;
}
return sb.toString();
}
/**
* 这段代码别看了,实在是太冗余,写完我都不想在看它,懒得重构
* 修改最后一段二进制与前面一段二进制 - 因为Integer.toBinaryString得到的补码会忽略到前面的0,故需要补全
* @param prefixSb
* @param suffixSb
* @param codeMap
* @return
*/
public static StringBuilder adjustString(StringBuilder prefixSb, StringBuilder suffixSb, Map< Character, String> codeMap) {
//1. 找到前半段没有匹配的二进制字符
Integer startIndex = 0;
Integer endIndex = 0;
String prefixStr = prefixSb.toString();
String suffixStr = suffixSb.toString();
Collection<String> codes = codeMap.values();
Integer count = 0;
while(true) {
++count;
//遍历查询前缀字符对应赫夫曼编码
//说明持续验证最后一段都不能找到对应的赫夫曼编码
if(count == 2) {
break;
}
for(String code : codes) {
Integer curEndIndex = startIndex + code.length();
//截图的结束索引大于被截取字段本身 - 则忽略该次查询
if(curEndIndex > prefixSb.length()) {
continue;
}
String substr = prefixStr.substring(startIndex, curEndIndex);
if(code.equals(code)) {
startIndex = curEndIndex;
endIndex = curEndIndex;
count = 0;
break;
}
}
//前半段都符合编码情况
if(startIndex >= prefixSb.length()) {
break;
}
}
// 2.找到赫夫曼编码符合修复条件的几个编码
String waitPrefixStr = null;
List<String> waitPrefixStrs = new ArrayList<String>();
if(startIndex != prefixStr.length()) {
waitPrefixStr = prefixStr.substring(startIndex);
System.out.println("waitPrefix:" + waitPrefixStr);
//查找修复前缀字段等待字段
for(String code : codes) {
//只有短过赫夫曼编码的才是需要被修复,否则不用修复
if(code.length() > waitPrefixStr.length()) {
boolean isMatch = code.substring(0,waitPrefixStr.length()).matches(waitPrefixStr);
if(isMatch) {
//waitPrefixStr只有末尾等于有0才用修复,否则不用修复
if(code.charAt(waitPrefixStr.length()) == '0') {
waitPrefixStrs.add(code);
}
}
}
}
}
//3. 真正的修复前半段 - 利用waitPrefixStrs进行依依比对
if(waitPrefixStrs.size() > 0) {
String copyWaitPrefixStr = new String(waitPrefixStr);
for(String repairTarget : waitPrefixStrs) {
for(int index = waitPrefixStr.length(); index < repairTarget.length(); index++) {
if(repairTarget.charAt(index) == '0') {
copyWaitPrefixStr += "0";
}else {
break;
}
}
String tempSuffixStr = copyWaitPrefixStr + suffixStr;
Integer sufixStartIndex = 0;
Integer suffixCount = 0;
//验证当前的copyWaitPrefixStr是否是需要的
boolean isValid = false;
while(true) {
++suffixCount;
if(suffixCount == 2) {
break;
}
for(String code : codes) {
Integer curSuffixEndIndex = sufixStartIndex + code.length();
if(curSuffixEndIndex > tempSuffixStr.length()) {
continue;
}
if(tempSuffixStr.substring(sufixStartIndex, curSuffixEndIndex).equals(code)) {
sufixStartIndex = curSuffixEndIndex;
suffixCount = 0;
break;
}
}
if(sufixStartIndex == tempSuffixStr.length()) {
isValid = true;
break;
}
}
if(isValid) {
for(int index = waitPrefixStr.length(); index < repairTarget.length(); index++) {
if(repairTarget.charAt(index) == '0') {
prefixSb.append("0");
}else {
break;
}
}
break;
}
}
}
return prefixSb.append(suffixSb);
}
public String getEncodedData() {
return encodedData;
}
private void zipCode() {
byte[] bytes;
//1.获取数组长度
Integer length = encodedData.length();
if (length % 8 == 0) {
bytes = new byte[length / 8];
} else {
bytes = new byte[length / 8 + 1];
}
//2.每8位截取一段字符(补码),将其转成10进制
Integer startIndex = 0;
Integer endIndex = 8;
Integer byteIndex = 0;
while (startIndex < length - 1) {
String binaryNum = encodedData.substring(startIndex, endIndex);
System.out.println(binaryNum);
Integer num = Integer.valueOf(binaryNum, 2);
bytes[byteIndex++] = num.byteValue();
startIndex += 8;
endIndex += 8;
if (endIndex > length) {
endIndex = length;
}
}
//3.返回字节数组
encodedDataArray = bytes;
}
public byte[] getEncodedDataBytes() {
return encodedDataArray;
}
public void EncodingData() {
char[] chars = new char[data.length()];
data.getChars(0, data.length(), chars, 0);
StringBuilder sb = new StringBuilder();
for (char curChar : chars) {
String code = codeMap.get(curChar);
sb.append(code);
}
encodedData = sb.toString();
}
/**
* 根据赫夫曼树 - 获取字符赫夫曼编码
*
* @param node 当前节点
* @param signLR 标记是左节点还是右节点 0:左节点 1:右节点
* @param sb 存储到达路径 --根据上述signLR存储 -- 字符赫夫曼编码
*/
private void initHuffmanCode(OptimalBinaryTree.Node node, String signLR, StringBuilder sb) {
StringBuilder sb2 = new StringBuilder(sb);
sb2.append(signLR);
if (node != null) {
if (node.data == null) {
initHuffmanCode(node.leftNode, "0", sb2);
initHuffmanCode(node.rightNode, "1", sb2);
} else {
Character curCharacter = ((MyCharacter) node.getData()).getCharacter();
codeMap.put(curCharacter, sb2.toString());
}
}
}
//计算原字符串各个字符出现的次数
private void initCharMap(String str) {
char[] chars = new char[str.length()];
str.getChars(0, str.length(), chars, 0);
charMap = new HashMap<>();
for (char curChar : chars) {
boolean hasKey = charMap.containsKey(curChar);
MyCharacter curCharacter;
if (hasKey) {
curCharacter = charMap.get(curChar);
Integer count = curCharacter.getCount();
curCharacter.setCount(++count);
} else {
curCharacter = new MyCharacter();
curCharacter.setCharacter(curChar);
curCharacter.setCount(1);
charMap.put(curChar, curCharacter);
}
}
}
//内部类 - 用来存储原字符串对应的字符以及出现次数、以及节点权重
private class MyCharacter implements Comparable<MyCharacter> {
Character character;
Integer asciiValue;
Integer count;
Integer weight;
public Character getCharacter() {
return character;
}
public void setCharacter(Character character) {
this.character = character;
this.asciiValue = (int) character.charValue();
}
public int getCount() {
return count;
}
public void setCount(int count) {
this.count = count;
this.weight = count;
}
@Override
public String toString() {
return "MyCharacter{" +
"character=" + character +
", asciiValue=" + asciiValue +
", count=" + count +
'}';
}
@Override
public int compareTo(MyCharacter orderCharacter) {
return this.count - orderCharacter.count;
}
}
public static void main(String[] args) {
// String str = "abcfdef tratr egcs";
String str = "I love you";
HuffmanCode code = new HuffmanCode(str);
System.out.println("赫夫曼编码后二进制转成字节数组:" + Arrays.toString(code.getEncodedDataBytes()));
//通过字节数组进行节码
byte[] bytes = code.getEncodedDataBytes();
//将字节数组转成二进制字符串 - 通过二进制字符串转成原字符串明文
String encodeStr = HuffmanCode.getEncodedDataByBytes(bytes, code.getCodeMap());
System.out.println("赫夫曼编码二进制字符串:" + encodeStr);
System.out.println(code.decode(encodeStr, code.getCodeMap()));
System.out.println(code.decode(bytes, code.getCodeMap()));
Integer length1 = str.length() * 8;
Integer length2 = code.getEncodedData().length();
double redPer = (length1 - length2) / (double) length1;
System.out.printf("固定长度ascii编码长度length1:%d,赫夫曼编码后长度length2:%d,所以减少率为:%f\n", length1, length2, redPer);
}
}