Commit f1382b6

Implement Huffman encoding
1 parent 8d259f3 commit f1382b6

File tree: 1 file changed, +81 −6 lines changed

  • app/src/main/java/org/vonderheidt/hips/utils

app/src/main/java/org/vonderheidt/hips/utils/Huffman.kt

Lines changed: 81 additions & 6 deletions
@@ -10,14 +10,89 @@ object Huffman {
     /**
      * Function to encode (the encrypted binary representation of) the secret message into a cover text using Huffman encoding.
      *
-     * Corresponds to Stegasuras method `encode_huffman` in `huffman_baseline.py`.
+     * Corresponds to Stegasuras method `encode_huffman` in `huffman_baseline.py`. Parameter `finish_sent` was removed (i.e. it is now hard-coded to true).
+     *
+     * @param context The context to encode the secret message with.
+     * @param cipherBits The encrypted binary representation of the secret message.
+     * @return A cover text containing the secret message.
      */
-    suspend fun encode(context: String, cipherBits: ByteArray, bitsPerToken: Int = Settings.bitsPerToken): String {
-        // Wait 5 seconds
-        delay(5000)
+    fun encode(context: String, cipherBits: ByteArray): String {
+        // Tokenize context
+        val contextTokens = LlamaCpp.tokenize(context)

-        // Return placeholder
-        val coverText = ""
+        // Convert cipher bits to bit string
+        val cipherBitString = Format.asBitString(cipherBits)
+
+        // Initialize array to store cover text tokens
+        var coverTextTokens = intArrayOf()
+
+        // Initialize variables and flags for loop
+        var i = 0
+        var isLastSentenceFinished = false
+
+        var isFirstRun = true // llama.cpp batch needs to store context tokens in the first run, but only the last sampled token in subsequent runs
+        var sampledToken = -1 // Will always be overwritten with the last cover text token
+
+        // Sample tokens until all bits of the secret message are encoded and the last sentence is finished
+        while (i < cipherBitString.length || !isLastSentenceFinished) {
+            // Huffman sampling to encode bits of the secret message into tokens
+            if (i < cipherBitString.length) {
+                // Call llama.cpp to calculate the logit matrix, similar to https://github.com/ggerganov/llama.cpp/blob/master/examples/simple/simple.cpp:
+                // Only the next tokens to be processed need to be stored in a batch, i.e. contextTokens in the first run and the last sampled token in subsequent runs; the rest is managed internally in ctx
+                // Only the last row of the logit matrix is needed, as it contains the logits corresponding to the last token of the prompt
+                val logits = LlamaCpp.getLogits(if (isFirstRun) contextTokens else intArrayOf(sampledToken)).last()
+
+                // Get the top 2^bitsPerToken logits for the last token of the prompt (= number of leaves of the Huffman tree)
+                val topLogits = getTopLogits(logits)
+
+                // Construct Huffman tree from top logits
+                val huffmanCoding = HuffmanCoding()
+                huffmanCoding.buildHuffmanTree(topLogits)
+                huffmanCoding.mergeHuffmanNodes()
+                val root = huffmanCoding.generateHuffmanCodes()
+
+                // Traverse the Huffman tree based on bits of the secret message to sample the next token, thereby encoding information in it
+                var currentNode = root
+
+                // The first nodes won't have a token as they were created during the merge step
+                while (currentNode.token == null) {
+                    // The first condition is needed in case (length of cipher bits) % (bits per token) != 0:
+                    // in the last iteration of the outer while loop, the inner while loop can cause i to exceed cipherBitString.length
+                    // The second condition is only checked if the first condition is false, so an IndexOutOfBoundsException can't happen
+                    if (i >= cipherBitString.length || cipherBitString[i] == '0') {
+                        // Asserting the left and right child nodes to be non-null is safe, as every inner node of a Huffman tree is created by merging two child nodes
+                        currentNode = currentNode.left!!
+                    }
+                    else {
+                        currentNode = currentNode.right!!
+                    }
+
+                    // Every time a turn is made when traversing the Huffman tree, another bit is encoded
+                    i++
+                }
+
+                // The token whose path encodes the next bits of the secret message is now found
+                sampledToken = currentNode.token!!
+
+                // Update flag
+                isFirstRun = false
+            }
+            // Greedy sampling to pick the most likely token until the last sentence is finished
+            else {
+                // The llama.cpp greedy sampler is used for efficiency instead of manually sorting the logits in descending order and picking the first one
+                // Input is only the last sampled token, similar to the else case of the getLogits input above, as greedy sampling only ever gets called after Huffman sampling
+                sampledToken = LlamaCpp.sample(sampledToken)
+
+                // Update flag
+                isLastSentenceFinished = LlamaCpp.isEndOfSentence(sampledToken)
+            }
+
+            // Append last sampled token to cover text tokens
+            coverTextTokens += sampledToken
+        }
+
+        // Detokenize cover text tokens into cover text to return it
+        val coverText = LlamaCpp.detokenize(coverTextTokens)

         return coverText
     }
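The encoder relies on a few helpers that are outside this diff. `Format.asBitString` turns the cipher bytes into a string of '0'/'1' characters; a minimal sketch, assuming each byte is expanded most significant bit first (the actual helper is defined elsewhere and may use a different bit order):

fun asBitString(bytes: ByteArray): String =
    bytes.joinToString(separator = "") { byte ->
        // Expand each byte into 8 characters, MSB first (assumed order)
        (7 downTo 0).joinToString(separator = "") { bit ->
            ((byte.toInt() shr bit) and 1).toString()
        }
    }

Under this assumption, byteArrayOf(90) would become "01011010".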

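`getTopLogits` is also not part of this commit. A minimal sketch of a top-2^bitsPerToken selection, assuming a (token ID, logit) pair representation and a bitsPerToken parameter (both assumptions; the real helper presumably reads Settings.bitsPerToken, since the parameter was dropped from encode's signature):

private fun getTopLogits(logits: FloatArray, bitsPerToken: Int = 4): List<Pair<Int, Float>> {
    // 2^bitsPerToken leaves are needed to build the Huffman tree
    val numLeaves = 1 shl bitsPerToken

    return logits
        .mapIndexed { token, logit -> token to logit } // pair each logit with its token ID
        .sortedByDescending { it.second }              // most likely tokens first
        .take(numLeaves)
}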
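The `HuffmanCoding` class (`buildHuffmanTree`, `mergeHuffmanNodes`, `generateHuffmanCodes`) is likewise defined elsewhere. A minimal sketch of the underlying construction, with hypothetical names except for the `token`, `left`, and `right` members the diff traverses:

import java.util.PriorityQueue

class HuffmanNode(
    val token: Int? = null,        // null for inner nodes created by merging
    val probability: Double,
    val left: HuffmanNode? = null, // child taken on bit '0'
    val right: HuffmanNode? = null // child taken on bit '1'
)

fun buildTree(leaves: List<HuffmanNode>): HuffmanNode {
    // Min-heap ordered by probability
    val heap = PriorityQueue<HuffmanNode>(compareBy { it.probability })
    heap.addAll(leaves)

    // Repeatedly merge the two least probable nodes until only the root remains
    while (heap.size > 1) {
        val first = heap.poll()
        val second = heap.poll()
        heap.add(HuffmanNode(probability = first.probability + second.probability, left = first, right = second))
    }

    return heap.poll()
}

Because Huffman coding assigns the shortest paths to the most probable tokens, uniformly random cipher bits select likely tokens most of the time, which is what keeps the cover text fluent.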

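Assuming the model behind LlamaCpp has already been loaded, a hypothetical call could look like this (context string and cipher bits made up for illustration):

val context = "The weather today is"
val cipherBits = byteArrayOf(0b01011010, 0b00110101) // output of an earlier encryption step
val coverText = Huffman.encode(context, cipherBits)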