Skip to content

Commit c5a1eb4

Browse files
Merge pull request #18 from aiven/hash-encoding
Encodes hashes hexadecimally instead of Base64
2 parents 747b4d6 + b4af4ab commit c5a1eb4

File tree

5 files changed

+892
-25
lines changed

5 files changed

+892
-25
lines changed

src/main/java/io/aiven/kafka/connect/transforms/Hash.java

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@
1818

1919
import java.security.MessageDigest;
2020
import java.security.NoSuchAlgorithmException;
21-
import java.util.Base64;
2221
import java.util.Map;
2322
import java.util.Optional;
2423

@@ -32,6 +31,8 @@
3231
import org.apache.kafka.connect.errors.DataException;
3332
import org.apache.kafka.connect.transforms.Transformation;
3433

34+
import io.aiven.kafka.connect.transforms.utils.Hex;
35+
3536
import org.slf4j.Logger;
3637
import org.slf4j.LoggerFactory;
3738

@@ -178,8 +179,9 @@ private Optional<Object> getNewValueWithoutFieldName(final String recordStr,
178179
}
179180

180181
private String hashString(final String string) {
182+
// We don't call reset() here because digest() does resetting afterwards.
181183
final byte[] digest = messageDigest.digest(string.getBytes());
182-
return Base64.getEncoder().encodeToString(digest);
184+
return Hex.encode(digest);
183185
}
184186

185187
public static class Key<R extends ConnectRecord<R>> extends Hash<R> {
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
/*
2+
* Copyright 2020 Aiven Oy
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package io.aiven.kafka.connect.transforms.utils;
18+
19+
/**
 * Static helper for encoding bytes as lowercase hexadecimal text.
 */
public final class Hex {
    private static final char[] HEX_ARRAY = "0123456789abcdef".toCharArray();

    private Hex() {
        // Static utility class; not meant to be instantiated.
    }

    /**
     * Encodes a byte array as a hexadecimal string.
     *
     * <p>Every input byte produces exactly two lowercase characters, high nibble first,
     * so the result is always {@code bytes.length * 2} characters long.
     *
     * @param bytes the bytes to encode; must not be {@code null}
     * @return the lowercase hexadecimal representation; empty for an empty array
     * @throws NullPointerException if {@code bytes} is {@code null}
     * @implNote https://stackoverflow.com/a/9855338/1781549
     */
    public static String encode(final byte[] bytes) {
        if (bytes == null) {
            throw new NullPointerException("bytes must not be null");
        }
        final char[] hexChars = new char[bytes.length * 2];
        for (int j = 0; j < bytes.length; j++) {
            // Mask to 0..255 so that negative bytes index HEX_ARRAY correctly.
            final int v = bytes[j] & 0xFF;
            hexChars[j * 2] = HEX_ARRAY[v >>> 4]; // hi nibble
            hexChars[j * 2 + 1] = HEX_ARRAY[v & 0x0F]; // lo nibble
        }
        return new String(hexChars);
    }
}

src/test/java/io/aiven/kafka/connect/transforms/HashTest.java

Lines changed: 41 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,6 @@
1616

1717
package io.aiven.kafka.connect.transforms;
1818

19-
import java.security.MessageDigest;
20-
import java.security.NoSuchAlgorithmException;
21-
import java.util.Base64;
2219
import java.util.HashMap;
2320
import java.util.Map;
2421

@@ -40,7 +37,32 @@ abstract class HashTest {
4037

4138
private static final String FIELD = "email";
4239
private static final String EMPTY_FIELD_VALUE = "";
43-
private static final String NON_EMPTY_FIELD_VALUE = "jerry@all_your_bases.com";
40+
private static final String NON_EMPTY_FIELD_VALUE = "[email protected]";
41+
42+
// Expected lowercase-hex digests, keyed first by hash function name and then by input value.
private static final Map<String, Map<String, String>> HASHED_VALUES = new HashMap<>();

static {
    // Every digest below was precomputed outside the JVM with the matching command-line
    // tool, feeding it the input without a trailing newline, e.g.:
    //   echo -n "<input value>" | md5sum -t

    // md5sum -t
    final Map<String, String> md5 = new HashMap<>();
    md5.put(EMPTY_FIELD_VALUE, "d41d8cd98f00b204e9800998ecf8427e");
    md5.put(NON_EMPTY_FIELD_VALUE, "10e5756d5d4c9c1cadd5e1b952071378");
    HASHED_VALUES.put("md5", md5);

    // sha1sum -t
    final Map<String, String> sha1 = new HashMap<>();
    sha1.put(EMPTY_FIELD_VALUE, "da39a3ee5e6b4b0d3255bfef95601890afd80709");
    sha1.put(NON_EMPTY_FIELD_VALUE, "dd9ab6e93603bf618db0894a82da64f1623a94b6");
    HASHED_VALUES.put("sha1", sha1);

    // sha256sum -t
    final Map<String, String> sha256 = new HashMap<>();
    sha256.put(EMPTY_FIELD_VALUE,
        "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855");
    sha256.put(NON_EMPTY_FIELD_VALUE,
        "20e85b05e7349963fc64746fbc7f3f4fdf31507921360847ebef333b229cf2d6");
    HASHED_VALUES.put("sha256", sha256);
}
65+
4466
private static final String DEFAULT_HASH_FUNCTION = HashConfig.HashFunction.SHA256.toString();
4567
private static final String UNAFFECTED_FIELD = "name";
4668
private static final String UNAFFECTED_FIELD_VALUE = "jerry";
@@ -232,6 +254,20 @@ void fieldName_EmptyStringValue(final String hashFunction) {
232254
assertEquals(setNewValue(originalRecord, newValue), result);
233255
}
234256

257+
@ParameterizedTest
258+
@ValueSource(strings = {"md5", "sha1", "sha256"})
259+
void sameValueSameHash(final String hashFunction) {
260+
final Schema schema = SchemaBuilder.STRING_SCHEMA;
261+
final Hash<SinkRecord> transform = transformation(null, false, hashFunction);
262+
263+
for (int i = 0; i < 10; i++) {
264+
final SinkRecord originalRecord = record(schema, NON_EMPTY_FIELD_VALUE);
265+
final SinkRecord result = transform.apply(originalRecord);
266+
final String newValue = hash(hashFunction, NON_EMPTY_FIELD_VALUE);
267+
assertEquals(setNewValue(originalRecord, newValue), result);
268+
}
269+
}
270+
235271
private Hash<SinkRecord> transformation(
236272
final String fieldName,
237273
final boolean skipMissingOrNull,
@@ -277,24 +313,6 @@ private SinkRecord setNewValue(final SinkRecord record, final Object newValue) {
277313
}
278314

279315
private String hash(final String function, final String value) {
280-
try {
281-
final MessageDigest md;
282-
switch (function) {
283-
case "md5":
284-
md = MessageDigest.getInstance("MD5");
285-
break;
286-
case "sha1":
287-
md = MessageDigest.getInstance("SHA1");
288-
break;
289-
case "sha256":
290-
md = MessageDigest.getInstance("SHA-256");
291-
break;
292-
default:
293-
throw new IllegalArgumentException(function);
294-
}
295-
return Base64.getEncoder().encodeToString(md.digest(value.getBytes()));
296-
} catch (final NoSuchAlgorithmException e) {
297-
throw new RuntimeException(e);
298-
}
316+
return HASHED_VALUES.get(function).get(value);
299317
}
300318
}
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
/*
2+
* Copyright 2020 Aiven Oy
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package io.aiven.kafka.connect.transforms.utils;
18+
19+
import java.io.IOException;
20+
import java.net.URISyntaxException;
21+
import java.net.URL;
22+
import java.nio.charset.Charset;
23+
import java.nio.file.Files;
24+
import java.nio.file.Paths;
25+
import java.util.List;
26+
27+
import org.junit.jupiter.api.Test;
28+
29+
import static org.junit.jupiter.api.Assertions.assertEquals;
30+
31+
public class HexTest {
32+
@Test
33+
void testEncodeEmpty() {
34+
final byte[] bytes = new byte[0];
35+
assertEquals("", Hex.encode(bytes));
36+
}
37+
38+
@Test
39+
void testEncodeSingleByte() {
40+
final byte[] bytes = new byte[1];
41+
for (int i = 0; i < 256; i++) {
42+
final byte b = (byte) i;
43+
bytes[0] = b;
44+
assertEquals(String.format("%02x", b), Hex.encode(bytes));
45+
}
46+
}
47+
48+
@Test
49+
void testEncodeFromStrings() throws IOException, URISyntaxException {
50+
final URL resource = getClass().getClassLoader().getResource("blns.txt");
51+
final List<String> strings = Files.readAllLines(Paths.get(resource.toURI()));
52+
for (final String s : strings) {
53+
// Use the string as a byte array and hex-encode it.
54+
final byte[] bytes = s.getBytes(Charset.defaultCharset());
55+
final String encoded = Hex.encode(bytes);
56+
assertEquals(bytes.length * 2, encoded.length());
57+
58+
// Decode the string back and compare to the original.
59+
final char[] encodedChars = encoded.toCharArray();
60+
final byte[] decodedBytes = new byte[bytes.length];
61+
for (int i = 0; i < encoded.length(); i += 2) {
62+
final String s1 = new String(encodedChars, i, 2);
63+
decodedBytes[i / 2] = (byte) Integer.parseInt(s1, 16);
64+
}
65+
assertEquals(new String(decodedBytes, Charset.defaultCharset()), s);
66+
}
67+
}
68+
}

0 commit comments

Comments
 (0)