Skip to content

Commit a6b3359

Browse files
author
Wosser1sProductions
committed
Huffman dictionary to stream
+ Huffman dictionary can now correctly be written to a stream, with minimal amount of bits + Huffman dictionary can be correctly read from stream TODO: Proper decoding function, dict to tree conversion, code clean-up and integration with Encoder
1 parent 44ef9f3 commit a6b3359

File tree

4 files changed

+182
-59
lines changed

4 files changed

+182
-59
lines changed

Debug/Debug.bg

7.93 KB
Binary file not shown.

Huffman.cpp

Lines changed: 170 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -5,17 +5,74 @@
55
#include "utils.hpp"
66
#include "Logger.hpp"
77

8+
////////////////////////////////////////////////////////////////////////////////
9+
/// Private functions
10+
////////////////////////////////////////////////////////////////////////////////
811

9-
template<class T>
10-
algo::Huffman<T>::Huffman(void) {
12+
/**
13+
* @brief A comparator to sort Codeword pairs by descending bit length (longest codewords first).
14+
*/
15+
struct CodewordComparator {
16+
bool operator()(const std::pair<uint8_t, algo::Codeword>& first,
17+
const std::pair<uint8_t, algo::Codeword>& second)
18+
{
19+
return first.second.len > second.second.len;
20+
}
21+
};
1122

23+
/**
24+
* @brief Add the given settings to the output stream according to the amount of bits
25+
* specified in the Huffman class.
26+
* @param length
27+
* The length of the sequence that will follow this header.
28+
* If the length is 0, only one '0' bit will be written.
29+
* @param bit_length
30+
* The amount of bits needed for every data element in the sequence following this header.
31+
* Keys always use KEY_BITS as length, and values use bit_length, which is different for each group.
32+
* This is done to minimize the amount of bits needed to save the Huffman dictionary.
33+
* @param writer
34+
* The outputstream to write to.
35+
*/
36+
template<class T>
37+
void algo::Huffman<T>::add_huffman_dict_header(uint32_t length, uint32_t bit_length, util::BitStreamWriter& writer) {
38+
if (length > 0) {
39+
writer.put(algo::Huffman<T>::DICT_HDR_HAS_ITEMS_BITS + algo::Huffman<T>::DICT_HDR_SEQ_LENGTH_BITS,
40+
0x80 | (length & 0x7F)); // MSB is HAS_ITEMS setting + 7 bits length
41+
writer.put(algo::Huffman<T>::DICT_HDR_ITEM_BITS,
42+
bit_length & 0xF); // 4 bits for bit length of every dict item
43+
} else {
44+
writer.put_bit(0);
45+
}
1246
}
1347

48+
/**
49+
* @brief Read a dictionary header from the inputstream and set the given variables.
50+
*
51+
* @param reader
52+
* The inputstream to read from.
53+
* @param length
54+
* The length of the sequence that will follow this header (will be set).
55+
* @param bit_length
56+
* The amount of bits for every value element in the following sequence (will be set).
57+
*
58+
* @return Returns true if there is data after this header. (first bit was set)
59+
*/
1460
template<class T>
15-
algo::Huffman<T>::~Huffman(void) {
16-
this->deleteTree(this->tree_root);
61+
bool algo::Huffman<T>::read_huffman_dict_header(util::BitStreamReader& reader, uint32_t& length, uint32_t& bit_length) {
62+
if (reader.get_bit()) {
63+
length = reader.get(algo::Huffman<T>::DICT_HDR_SEQ_LENGTH_BITS);
64+
bit_length = reader.get(algo::Huffman<T>::DICT_HDR_ITEM_BITS);
65+
return true;
66+
}
67+
68+
return false;
1769
}
1870

71+
/**
72+
* @brief Deallocate every node in the given tree.
73+
* @param root
74+
* The node to start with and delete its children.
75+
*/
1976
template<class T>
2077
void algo::Huffman<T>::deleteTree(algo::Node<> *root) {
2178
if (root == nullptr) return;
@@ -34,30 +91,26 @@ void algo::Huffman<T>::deleteTree(algo::Node<> *root) {
3491
* The current stream of bits for a path in the tree.
3592
*/
3693
template<class T>
37-
size_t algo::Huffman<T>::buildDict(const algo::Node<> * const node, std::vector<bool> stream) {
94+
void algo::Huffman<T>::buildDict(const algo::Node<> * const node, std::vector<bool> stream) {
3895
if (node == nullptr) {
39-
return 0u;
96+
return;
4097
}
4198

4299
// Check if leaf
43100
if (node->left == nullptr && node->right == nullptr) {
44-
const uint32_t size = uint32_t(stream.size());
45-
46101
this->dict[node->data] = Codeword {
47102
std::accumulate(stream.begin(), stream.end(), uint32_t(0u),
48103
[=](uint32_t x, uint32_t y) { return (x << 1u) | y; }),
49-
size
104+
uint32_t(stream.size())
50105
};
51-
52-
return size;
53106
}
54107

55108
std::vector<bool> lstream(stream);
56109
lstream.push_back(false);
57110
stream.push_back(true);
58111

59-
return std::max(this->buildDict(node->left , lstream),
60-
this->buildDict(node->right, stream));
112+
this->buildDict(node->left , lstream);
113+
this->buildDict(node->right, stream);
61114
}
62115

63116
/**
@@ -89,6 +142,18 @@ void algo::Huffman<T>::decode(const algo::Node<> * const node, util::BitStreamRe
89142
}
90143
}
91144

145+
////////////////////////////////////////////////////////////////////////////////
146+
147+
template<class T>
148+
algo::Huffman<T>::Huffman(void) {
149+
150+
}
151+
152+
template<class T>
153+
algo::Huffman<T>::~Huffman(void) {
154+
this->deleteTree(this->tree_root);
155+
}
156+
92157
/**
93158
* @brief Encode bits of length sizeof(T) with Huffman encoding and
94159
* write the Huffman dict and the encoded data to an outputstream.
@@ -98,7 +163,7 @@ void algo::Huffman<T>::decode(const algo::Node<> * const node, util::BitStreamRe
98163
* @return Returns a new bitstream with the encoded data.
99164
*/
100165
template<class T>
101-
util::BitStreamWriter* algo::Huffman<T>::encode(util::BitStreamReader& reader) {
166+
util::BitStreamWriter* algo::Huffman<T>::encode(util::BitStreamReader& reader) {
102167
const size_t length = reader.get_size() * 8u;
103168

104169
// Calculate frequencies
@@ -115,6 +180,8 @@ util::BitStreamWriter* algo::Huffman<T>::encode(util::BitStreamReader& reader) {
115180

116181
for (const auto& pair: freqs) {
117182
pq.push(util::allocVar<algo::Node<>>(pair.first, pair.second));
183+
184+
util::Logger::WriteLn(std::string_format("%02X: %d", pair.first, pair.second), false);
118185
}
119186

120187
while (pq.size() > 1) {
@@ -128,67 +195,113 @@ util::BitStreamWriter* algo::Huffman<T>::encode(util::BitStreamReader& reader) {
128195

129196
this->tree_root = pq.top();
130197

131-
const size_t h_table_bits = this->buildDict(this->tree_root, std::vector<bool>());
132-
const size_t h_dict_total_length = (algo::Huffman<>::KEY_BITS + h_table_bits)
133-
* this->dict.size() // Every {key: val} pair
134-
+ algo::Huffman<>::KEY_BITS // Length of table itself
135-
+ algo::Huffman<>::SIZE_BITS; // Bits per value
198+
this->buildDict(this->tree_root, std::vector<bool>());
136199

137-
util::Logger::WriteLn(std::string_format("[Huffman] {key:%d, val:%d} for %d entries + %d hdr bits (%.1f total bytes).",
138-
algo::Huffman<>::KEY_BITS, h_table_bits, this->dict.size(),
139-
(algo::Huffman<>::KEY_BITS + algo::Huffman<>::SIZE_BITS),
140-
float(h_dict_total_length) / 8.0f));
200+
// Create new list with dict elements sorted by bit length for saving to stream
201+
// Sort the dictionary by value bit length
202+
std::vector<std::pair<uint8_t, algo::Codeword>> sorted_dict(this->dict.begin(), this->dict.end());
203+
std::sort(sorted_dict.begin(), sorted_dict.end(), CodewordComparator());
141204

142-
util::BitStreamWriter *writer = util::allocVar<util::BitStreamWriter>((h_dict_total_length + length) / 8 + 1);
205+
// Determine frequencies of each bit length with {bit_length: freq}
206+
std::unordered_map<uint32_t, uint32_t> bit_freqs;
207+
for (const auto& w : sorted_dict) {
208+
bit_freqs[w.second.len]++;
209+
}
143210

144-
writer->put(algo::Huffman<>::KEY_BITS , uint32_t(this->dict.size())); ///< Put table size
145-
writer->put(algo::Huffman<>::SIZE_BITS, uint32_t(h_table_bits)); ///< Put bit length of a table value
211+
// Calculate total needed length for dict
212+
size_t h_dict_total_length = (algo::Huffman<>::KEY_BITS * this->dict.size()) // Amount of bits needed for keys
213+
+ ((algo::Huffman<>::DICT_HDR_HAS_ITEMS_BITS + algo::Huffman<>::DICT_HDR_ITEM_BITS + algo::Huffman<>::DICT_HDR_SEQ_LENGTH_BITS)
214+
* bit_freqs.size()) // Amount of bits for each header
215+
+ 1; // Stop bit
216+
for (const auto& f : bit_freqs) {
217+
h_dict_total_length += f.first * f.second; // Amount of bits for each header group
218+
}
146219

147-
for (const auto& pair : this->dict) {
148-
writer->put(algo::Huffman<>::KEY_BITS, pair.first); // Put Key
149-
writer->put(h_table_bits, pair.second.word); // Put Val
220+
util::Logger::WriteLn(std::string_format("[Huffman] Dict{key:%d, val:*} for %d entries + hdr bits: %.1f total bytes.",
221+
algo::Huffman<>::KEY_BITS, this->dict.size(),
222+
float(h_dict_total_length) / 8.0f));
223+
224+
//*** Save the Huffman dictionary to a stream ***//
225+
util::BitStreamWriter *writer = util::allocVar<util::BitStreamWriter>((h_dict_total_length + length) / 8 + 1);
226+
uint32_t seq_len = 0u, bit_len = 0u;
227+
228+
// Add headers for each group of same length key:val pairs
229+
// and write them to the stream
230+
for (const auto& w : sorted_dict) {
231+
if (seq_len == 0) {
232+
// New group
233+
bit_len = w.second.len;
234+
seq_len = bit_freqs[bit_len];
235+
add_huffman_dict_header(seq_len, bit_len, *writer);
236+
}
237+
238+
writer->put(algo::Huffman<>::KEY_BITS, w.first); // Put Key
239+
writer->put(bit_len, w.second.word); // Put Val
240+
seq_len--;
150241
}
151242

243+
add_huffman_dict_header(0, 0, *writer);
244+
152245

153246
/*******************************************************************************/
154247

155248
/*ori*/
156249
reader.set_position(0);
157-
while(reader.get_position() != length) {
158-
const T word = T(reader.get(algo::Huffman<>::KEY_BITS));
159-
util::Logger::Write(std::string_format("%X", word), false);
160-
} util::Logger::WriteLn(std::string_format(" (%d bytes)", length/8), false);
250+
// while(reader.get_position() != length) {
251+
// const T word = T(reader.get(algo::Huffman<>::KEY_BITS));
252+
// util::Logger::Write(std::string_format("%X", word), false);
253+
// }
254+
util::Logger::WriteLn(std::string_format(" (%d bytes)", length/8), false);
161255

162256
/*encoded*/
257+
// Encode
163258
reader.set_position(0);
164259
while(reader.get_position() != length) {
165260
const T word = T(reader.get(algo::Huffman<>::KEY_BITS));
166-
util::Logger::Write(std::string_format("%X", this->dict[word]), false);
167-
168261
writer->put(this->dict[word].len, this->dict[word].word); //TODO
169-
} util::Logger::WriteLn("", false);
262+
}
263+
264+
/*encoded stream*/
265+
size_t len = writer->get_position() / 8;
266+
// for (size_t i = 0; i < len; i++) {
267+
// util::Logger::Write(std::string_format("%X", writer->get_buffer()[i]), false);
268+
// }
269+
util::Logger::WriteLn(std::string_format(" (%d bytes)", len), false);
170270

171271
/*decoded*/
172272
util::BitStreamReader enc(writer->get_buffer(), (writer->get_position() / 8) + 1);
173-
size_t table_size = enc.get(algo::Huffman<>::KEY_BITS);
174-
size_t entry_bits = enc.get(algo::Huffman<>::SIZE_BITS);
175-
enc.set_position(enc.get_position() + (algo::Huffman<>::KEY_BITS + entry_bits) * table_size);
176273

177-
util::BitStreamWriter out(length/8);
274+
// readDictFromStream(enc);
275+
uint32_t dseq_len = 0u, dbit_len = 0u;
276+
this->dict.clear();
277+
// this->deleteTree(this->tree_root);
178278

179-
while (enc.get_position() <= enc.get_size() * 8u) {
180-
this->decode(this->tree_root, enc, out);
181-
} util::Logger::WriteLn("", false);
279+
while(this->read_huffman_dict_header(enc, dseq_len, dbit_len)) { // While header is followed by sequence
280+
while (dseq_len--) { // For each element, read {key, val}
281+
this->dict[T(enc.get(algo::Huffman<>::KEY_BITS))] = Codeword { enc.get(dbit_len), dbit_len };
282+
// TODO Add element to tree
283+
}
284+
}
182285

183-
out.set_position(0);
184-
for (size_t i = 0; i < out.get_size(); i++) {
185-
util::Logger::Write(std::string_format("%X", out.get_buffer()[i]), false);
186-
} util::Logger::WriteLn("", false);
286+
// util::BitStreamWriter out(length/8);
287+
288+
// while (enc.get_position() <= enc.get_size() * 8u) {
289+
// this->decode(this->tree_root, enc, out);
290+
// } util::Logger::WriteLn("", false);
291+
292+
// out.set_position(0);
293+
// for (size_t i = 0; i < out.get_size(); i++) {
294+
// util::Logger::Write(std::string_format("%X", out.get_buffer()[i]), false);
295+
// } util::Logger::WriteLn("", false);
296+
297+
// util::Logger::WriteLn("", false);
298+
// this->printTree();
299+
// util::Logger::WriteLn("", false);
187300

188-
util::Logger::WriteLn("", false);
189-
this->printTree();
190301
util::Logger::WriteLn("", false);
191302

303+
this->printDict();
304+
192305
return writer;
193306
}
194307

@@ -203,12 +316,16 @@ util::BitStreamWriter* algo::Huffman<T>::encode(util::BitStreamReader& reader) {
203316
template<class T>
204317
util::BitStreamWriter* algo::Huffman<T>::decode(util::BitStreamReader& reader) {
205318
const size_t table_size = reader.get(algo::Huffman<>::KEY_BITS); ///< Get table size
206-
const size_t entry_bits = reader.get(algo::Huffman<>::SIZE_BITS); ///< Get entry bit length
207319
const size_t data_bits = reader.get_size() * 8u; ///< Amount of data bits
208320

209-
for (size_t i = 0; i < table_size; i++) {
210-
this->dict[T(reader.get(algo::Huffman<>::KEY_BITS))] = Codeword { reader.get(entry_bits), 0u };
211-
}
321+
322+
// TODO if first bit is zero => no Huffman table => do nothing, just pass the stream back
323+
// use internal flag to enable Huffman, if disabled, write 1 zero to stream before data,
324+
// and later just call huffman.decode() (see TODO this TODO)
325+
326+
// for (size_t i = 0; i < table_size; i++) {
327+
// this->dict[T(reader.get(algo::Huffman<>::KEY_BITS))] = Codeword { reader.get(entry_bits), 0u };
328+
// }
212329

213330

214331
// TODO Create tree from dict

Huffman.hpp

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -58,10 +58,10 @@ namespace algo {
5858
/**
5959
* Data struct for Huffman dictionary entries.
6060
*/
61-
typedef struct {
61+
struct Codeword {
6262
uint32_t word;
6363
uint32_t len;
64-
} Codeword;
64+
};
6565

6666
/**
6767
* @brief Huffman class
@@ -73,7 +73,10 @@ namespace algo {
7373

7474
std::unordered_map<T, Codeword> dict;
7575

76-
size_t buildDict(const algo::Node<> * const, std::vector<bool>);
76+
void add_huffman_dict_header(uint32_t, uint32_t, util::BitStreamWriter&);
77+
bool read_huffman_dict_header(util::BitStreamReader&, uint32_t&, uint32_t&);
78+
79+
void buildDict(const algo::Node<> * const, std::vector<bool>);
7780
void decode(const algo::Node<> * const, util::BitStreamReader&, util::BitStreamWriter&);
7881

7982
void deleteTree(algo::Node<>*);
@@ -89,8 +92,11 @@ namespace algo {
8992
void printDict(void);
9093
void printTree(void);
9194

92-
static constexpr size_t KEY_BITS = util::size_of<T>();
93-
static constexpr size_t SIZE_BITS = util::size_of<T>();
95+
static constexpr size_t KEY_BITS = util::size_of<T>(); ///< Bit length for keys in Huffman dict
96+
97+
static constexpr size_t DICT_HDR_HAS_ITEMS_BITS = 1u; ///< Whether there are dictionary items following (bit length)
98+
static constexpr size_t DICT_HDR_SEQ_LENGTH_BITS = 7u; ///< Amount of bits to represent the length of following items
99+
static constexpr size_t DICT_HDR_ITEM_BITS = 4u; ///< Amount of bits to represent the bit length of each following item's value
94100
};
95101

96102
extern template class Node<uint8_t>;

main.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,6 @@
2323
#endif // 1
2424

2525
//#define LOG_OFF ///< Force logging off
26-
#define LOG_LOCAL ///< Enable Block-level logging (a lot of overhead, use sparingly)
26+
//#define LOG_LOCAL ///< Enable Block-level logging (a lot of overhead, use sparingly)
2727

2828
#endif // MAIN_HPP

0 commit comments

Comments
 (0)