diff --git a/.classpath b/.classpath deleted file mode 100644 index 534b5e5..0000000 --- a/.classpath +++ /dev/null @@ -1,36 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/.gitignore b/.gitignore index e43e9a3..4359056 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,10 @@ *.war *.ear +.classpath +.settings +.project + # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml hs_err_pid* /target diff --git a/.project b/.project deleted file mode 100644 index 1202499..0000000 --- a/.project +++ /dev/null @@ -1,23 +0,0 @@ - - - encoding-test - - - - - - org.eclipse.jdt.core.javabuilder - - - - - org.eclipse.m2e.core.maven2Builder - - - - - - org.eclipse.jdt.core.javanature - org.eclipse.m2e.core.maven2Nature - - diff --git a/.settings/org.eclipse.jdt.core.prefs b/.settings/org.eclipse.jdt.core.prefs deleted file mode 100644 index 60105c1..0000000 --- a/.settings/org.eclipse.jdt.core.prefs +++ /dev/null @@ -1,5 +0,0 @@ -eclipse.preferences.version=1 -org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.6 -org.eclipse.jdt.core.compiler.compliance=1.6 -org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning -org.eclipse.jdt.core.compiler.source=1.6 diff --git a/.settings/org.eclipse.m2e.core.prefs b/.settings/org.eclipse.m2e.core.prefs deleted file mode 100644 index f897a7f..0000000 --- a/.settings/org.eclipse.m2e.core.prefs +++ /dev/null @@ -1,4 +0,0 @@ -activeProfiles= -eclipse.preferences.version=1 -resolveWorkspaceProjects=true -version=1 diff --git a/README.md b/README.md index 83697aa..8e73b04 100644 --- a/README.md +++ b/README.md @@ -5,119 +5,212 @@ Testing encoding size when forced to go through a UTF string (i.e. 
bytes -> UTF Output: ``` -long direct: 800 -UTF-8 bytes of .toString(): 1900 -UTF-8 bytes of Base:16: 1600 -UTF-8 bytes of Base:17: 1600 -UTF-8 bytes of Base:18: 1600 -UTF-8 bytes of Base:19: 1500 -UTF-8 bytes of Base:20: 1500 -UTF-8 bytes of Base:21: 1500 -UTF-8 bytes of Base:22: 1500 -UTF-8 bytes of Base:23: 1400 -UTF-8 bytes of Base:24: 1400 -UTF-8 bytes of Base:25: 1400 -UTF-8 bytes of Base:26: 1400 -UTF-8 bytes of Base:27: 1400 -UTF-8 bytes of Base:28: 1400 -UTF-8 bytes of Base:29: 1300 -UTF-8 bytes of Base:30: 1300 -UTF-8 bytes of Base:31: 1300 -UTF-8 bytes of Base:32: 1300 -UTF-8 bytes of Base:33: 1300 -UTF-8 bytes of Base:34: 1300 -UTF-8 bytes of Base:35: 1300 -UTF-8 bytes of Base:36: 1300 -UTF-8 bytes of Base:37: 1300 -UTF-8 bytes of Base:38: 1300 -UTF-8 bytes of Base:39: 1200 -UTF-8 bytes of Base:40: 1200 -UTF-8 bytes of Base:41: 1200 -UTF-8 bytes of Base:42: 1200 -UTF-8 bytes of Base:43: 1200 -UTF-8 bytes of Base:44: 1200 -UTF-8 bytes of Base:45: 1200 -UTF-8 bytes of Base:46: 1200 -UTF-8 bytes of Base:47: 1200 -UTF-8 bytes of Base:48: 1200 -UTF-8 bytes of Base:49: 1200 -UTF-8 bytes of Base:50: 1200 -UTF-8 bytes of Base:51: 1200 -UTF-8 bytes of Base:52: 1200 -UTF-8 bytes of Base:53: 1100 -UTF-8 bytes of Base:54: 1100 -UTF-8 bytes of Base:55: 1100 -UTF-8 bytes of Base:56: 1100 -UTF-8 bytes of Base:57: 1100 -UTF-8 bytes of Base:58: 1100 -UTF-8 bytes of Base:59: 1100 -UTF-8 bytes of Base:60: 1100 -UTF-8 bytes of Base:61: 1100 -UTF-8 bytes of Base:62: 1100 -UTF-8 bytes of Base:63: 1100 -UTF-8 bytes of Base:64: 1100 -UTF-8 bytes of Base:65: 1100 -UTF-8 bytes of Base:66: 1100 -UTF-8 bytes of Base:67: 1100 -UTF-8 bytes of Base:68: 1100 -UTF-8 bytes of Base:69: 1100 -UTF-8 bytes of Base:70: 1100 -UTF-8 bytes of Base:71: 1100 -UTF-8 bytes of Base:72: 1100 -UTF-8 bytes of Base:73: 1100 -UTF-8 bytes of Base:74: 1100 -UTF-8 bytes of Base:75: 1100 -UTF-8 bytes of Base:76: 1100 -UTF-8 bytes of Base:77: 1100 -UTF-8 bytes of Base:78: 1100 -UTF-8 bytes of Base:79: 1000 
-UTF-8 bytes of Base:80: 1000 -UTF-8 bytes of Base:81: 1000 -UTF-8 bytes of Base:82: 1000 -UTF-8 bytes of Base:83: 1000 -UTF-8 bytes of Base:84: 1000 -UTF-8 bytes of Base:85: 1000 -UTF-8 bytes of Base:86: 1000 -UTF-8 bytes of Base:87: 1000 -UTF-8 bytes of Base:88: 1000 -UTF-8 bytes of Base:89: 1000 -UTF-8 bytes of Base:90: 1000 -UTF-8 bytes of Base:91: 1000 -UTF-8 bytes of Base:92: 1000 -UTF-8 bytes of Base:93: 1000 -UTF-8 bytes of Base:94: 1000 -UTF-8 bytes of Base:95: 1000 -UTF-8 bytes of Base:96: 1000 -UTF-8 bytes of Base:97: 1000 -UTF-8 bytes of Base:98: 1000 -UTF-8 bytes of Base:99: 1000 -UTF-8 bytes of Base:100: 1000 -UTF-8 bytes of Base:101: 1000 -UTF-8 bytes of Base:102: 1000 -UTF-8 bytes of Base:103: 1000 -UTF-8 bytes of Base:104: 1000 -UTF-8 bytes of Base:105: 1000 -UTF-8 bytes of Base:106: 1000 -UTF-8 bytes of Base:107: 1000 -UTF-8 bytes of Base:108: 1000 -UTF-8 bytes of Base:109: 1000 -UTF-8 bytes of Base:110: 1000 -UTF-8 bytes of Base:111: 1000 -UTF-8 bytes of Base:112: 1000 -UTF-8 bytes of Base:113: 1000 -UTF-8 bytes of Base:114: 1000 -UTF-8 bytes of Base:115: 1000 -UTF-8 bytes of Base:116: 1000 -UTF-8 bytes of Base:117: 1000 -UTF-8 bytes of Base:118: 1000 -UTF-8 bytes of Base:119: 1000 -UTF-8 bytes of Base:120: 1000 -UTF-8 bytes of Base:121: 1000 -UTF-8 bytes of Base:122: 1000 -UTF-8 bytes of Base:123: 1000 -UTF-8 bytes of Base:124: 1000 -UTF-8 bytes of Base:125: 1000 -UTF-8 bytes of Base:126: 1000 -UTF-8 bytes of Base:127: 1000 -UTF-8 bytes of Base:128: 900 +Encoding: Bennight Base 2 +Average Encoded Bytes (2 original bytes): 17.0 +Average Encoded Bytes (3 original bytes): 25.0 +Average Encoded Bytes (4 original bytes): 33.0 +Average Encoded Bytes (5 original bytes): 41.0 +Average Encoded Bytes (6 original bytes): 49.0 +Average Encoded Bytes (7 original bytes): 57.0 +Average Encoded Bytes (8 original bytes): 65.0 +Total Successes: 71434240 +Success Rate: 1.0 +Time in encode: 0:08:44.604 +Time in decode: 0:23:39.100 +*************************** + + 
+Encoding: Bennight Base 4 +Average Encoded Bytes (2 original bytes): 9.0 +Average Encoded Bytes (3 original bytes): 13.0 +Average Encoded Bytes (4 original bytes): 17.0 +Average Encoded Bytes (5 original bytes): 21.0 +Average Encoded Bytes (6 original bytes): 25.0 +Average Encoded Bytes (7 original bytes): 29.0 +Average Encoded Bytes (8 original bytes): 33.0 +Total Successes: 71434240 +Success Rate: 1.0 +Time in encode: 0:04:31.915 +Time in decode: 0:10:23.793 +*************************** + + +Encoding: Bennight Base 8 +Average Encoded Bytes (2 original bytes): 6.0 +Average Encoded Bytes (3 original bytes): 9.0 +Average Encoded Bytes (4 original bytes): 11.0 +Average Encoded Bytes (5 original bytes): 14.0 +Average Encoded Bytes (6 original bytes): 17.0 +Average Encoded Bytes (7 original bytes): 19.0 +Average Encoded Bytes (8 original bytes): 22.0 +Total Successes: 71434240 +Success Rate: 1.0 +Time in encode: 0:03:05.284 +Time in decode: 0:06:25.902 +*************************** + + +Encoding: Bennight Base 16 +Average Encoded Bytes (2 original bytes): 5.0 +Average Encoded Bytes (3 original bytes): 7.0 +Average Encoded Bytes (4 original bytes): 9.0 +Average Encoded Bytes (5 original bytes): 11.0 +Average Encoded Bytes (6 original bytes): 13.0 +Average Encoded Bytes (7 original bytes): 15.0 +Average Encoded Bytes (8 original bytes): 17.0 +Total Successes: 71434240 +Success Rate: 1.0 +Time in encode: 0:02:28.658 +Time in decode: 0:04:47.536 +*************************** + + +Encoding: Bennight Base 32 +Average Encoded Bytes (2 original bytes): 4.0 +Average Encoded Bytes (3 original bytes): 5.0 +Average Encoded Bytes (4 original bytes): 7.0 +Average Encoded Bytes (5 original bytes): 9.0 +Average Encoded Bytes (6 original bytes): 10.0 +Average Encoded Bytes (7 original bytes): 12.0 +Average Encoded Bytes (8 original bytes): 13.0 +Total Successes: 71434240 +Success Rate: 1.0 +Time in encode: 0:02:00.650 +Time in decode: 0:03:49.458 +*************************** + + 
+Encoding: Bennight Base 64 +Average Encoded Bytes (2 original bytes): 3.0 +Average Encoded Bytes (3 original bytes): 5.0 +Average Encoded Bytes (4 original bytes): 6.0 +Average Encoded Bytes (5 original bytes): 7.0 +Average Encoded Bytes (6 original bytes): 9.0 +Average Encoded Bytes (7 original bytes): 10.0 +Average Encoded Bytes (8 original bytes): 11.0 +Total Successes: 71434240 +Success Rate: 1.0 +Time in encode: 0:01:42.823 +Time in decode: 0:03:08.459 +*************************** + + +Encoding: Bennight Base 85 +Average Encoded Bytes (2 original bytes): 3.0 +Average Encoded Bytes (3 original bytes): 4.0 +Average Encoded Bytes (4 original bytes): 5.9614105224609375 +Average Encoded Bytes (5 original bytes): 7.0 +Average Encoded Bytes (6 original bytes): 8.0 +Average Encoded Bytes (7 original bytes): 9.0 +Average Encoded Bytes (8 original bytes): 10.75 +Total Successes: 71434240 +Success Rate: 1.0 +Time in encode: 0:01:39.455 +Time in decode: 0:03:08.377 +*************************** + + +Encoding: Bennight Base 128 +Average Encoded Bytes (2 original bytes): 3.0 +Average Encoded Bytes (3 original bytes): 4.0 +Average Encoded Bytes (4 original bytes): 5.0 +Average Encoded Bytes (5 original bytes): 6.0 +Average Encoded Bytes (6 original bytes): 7.0 +Average Encoded Bytes (7 original bytes): 9.0 +Average Encoded Bytes (8 original bytes): 10.0 +Total Successes: 71434240 +Success Rate: 1.0 +Time in encode: 0:01:27.609 +Time in decode: 0:02:32.070 +*************************** + + +Encoding: Guava Base 16 +Average Encoded Bytes (2 original bytes): 4.0 +Average Encoded Bytes (3 original bytes): 6.0 +Average Encoded Bytes (4 original bytes): 8.0 +Average Encoded Bytes (5 original bytes): 10.0 +Average Encoded Bytes (6 original bytes): 12.0 +Average Encoded Bytes (7 original bytes): 14.0 +Average Encoded Bytes (8 original bytes): 16.0 +Total Successes: 71434240 +Success Rate: 1.0 +Time in encode: 0:00:28.147 +Time in decode: 0:00:38.829 +*************************** + + 
+Encoding: Guava Base 32 +Average Encoded Bytes (2 original bytes): 8.0 +Average Encoded Bytes (3 original bytes): 8.0 +Average Encoded Bytes (4 original bytes): 8.0 +Average Encoded Bytes (5 original bytes): 8.0 +Average Encoded Bytes (6 original bytes): 16.0 +Average Encoded Bytes (7 original bytes): 16.0 +Average Encoded Bytes (8 original bytes): 16.0 +Total Successes: 71434240 +Success Rate: 1.0 +Time in encode: 0:00:23.337 +Time in decode: 0:00:55.243 +*************************** + + +Encoding: Guava Base 64 +Average Encoded Bytes (2 original bytes): 4.0 +Average Encoded Bytes (3 original bytes): 4.0 +Average Encoded Bytes (4 original bytes): 8.0 +Average Encoded Bytes (5 original bytes): 8.0 +Average Encoded Bytes (6 original bytes): 8.0 +Average Encoded Bytes (7 original bytes): 12.0 +Average Encoded Bytes (8 original bytes): 12.0 +Total Successes: 71434240 +Success Rate: 1.0 +Time in encode: 0:00:21.299 +Time in decode: 0:00:47.335 +*************************** + + +Encoding: HBase Base 64 +Average Encoded Bytes (2 original bytes): 4.0 +Average Encoded Bytes (3 original bytes): 4.0 +Average Encoded Bytes (4 original bytes): 8.0 +Average Encoded Bytes (5 original bytes): 8.0 +Average Encoded Bytes (6 original bytes): 8.0 +Average Encoded Bytes (7 original bytes): 12.0 +Average Encoded Bytes (8 original bytes): 12.0 +Total Successes: 71434240 +Success Rate: 1.0 +Time in encode: 0:01:04.723 +Time in decode: 0:05:30.249 +*************************** + + +Encoding: Base 91 +Average Encoded Bytes (2 original bytes): 3.0 +Average Encoded Bytes (3 original bytes): 4.0 +Average Encoded Bytes (4 original bytes): 5.0 +Average Encoded Bytes (5 original bytes): 6.7965087890625 +Average Encoded Bytes (6 original bytes): 7.97198486328125 +Average Encoded Bytes (7 original bytes): 8.99853515625 +Average Encoded Bytes (8 original bytes): 10.0 +Total Successes: 71434240 +Success Rate: 1.0 +Time in encode: 0:00:56.579 +Time in decode: 0:00:59.654 +*************************** + 
+ +Encoding: Base 128 +Average Encoded Bytes (2 original bytes): 3.0 +Average Encoded Bytes (3 original bytes): 4.0 +Average Encoded Bytes (4 original bytes): 5.0 +Average Encoded Bytes (5 original bytes): 6.0 +Average Encoded Bytes (6 original bytes): 7.0 +Average Encoded Bytes (7 original bytes): 8.0 +Average Encoded Bytes (8 original bytes): 10.0 +Total Successes: 71434240 +Success Rate: 1.0 +Time in encode: 0:00:35.485 +Time in decode: 0:00:36.025 +*************************** ``` diff --git a/pom.xml b/pom.xml index 86a582d..e35a18d 100644 --- a/pom.xml +++ b/pom.xml @@ -10,6 +10,28 @@ guava 18.0 + + com.googlecode.combinatoricslib + combinatoricslib + 2.1 + + + org.apache.hbase + hbase + 0.90.2 + + + org.jruby + jruby-complete + + + + + junit + junit + 4.11 + test + @@ -18,8 +40,8 @@ maven-compiler-plugin 3.1 - 1.6 - 1.6 + 1.7 + 1.7 diff --git a/src/main/java/com/bennight/BaseNEncoder.java b/src/main/java/com/bennight/BaseNEncoder.java index d1ea712..6dda8fd 100644 --- a/src/main/java/com/bennight/BaseNEncoder.java +++ b/src/main/java/com/bennight/BaseNEncoder.java @@ -4,44 +4,58 @@ import java.util.HashMap; import java.util.Map; -public class BaseNEncoder { - - +import org.apache.commons.lang.ArrayUtils; + +import com.fecher.BinaryEncoding; + +public class BaseNEncoder implements + BinaryEncoding +{ + private final char[] _dictionary; private final Map _reverseDictionary; private final BigInteger _base; - - - public BaseNEncoder(char[] dictionary){ + + public BaseNEncoder( + final char[] dictionary ) { _dictionary = dictionary; _base = BigInteger.valueOf(dictionary.length); _reverseDictionary = reverseDictionary(dictionary); } - - public BaseNEncoder(int base) throws Exception{ - if (base > 128){ - throw new Exception("Only handles mapping to ASCII char set, base must be <= 128"); - } + + public BaseNEncoder( + final int base ) { + // if (base > 128) { + // throw new Exception( + // "Only handles mapping to ASCII char set, base must be <= 128"); + // } 
_dictionary = new char[base]; - for (int x = 0; x < base; x++){ - _dictionary[x] = (char)x; + for (int x = 0; x < base; x++) { + _dictionary[x] = (char) x; } - + _base = BigInteger.valueOf(base); _reverseDictionary = reverseDictionary(_dictionary); } - - private static Map reverseDictionary(char[] dictionary){ - Map reverse = new HashMap(); - for (int i = 0; i < dictionary.length; i++){ - reverse.put(dictionary[i], i); + + private static Map reverseDictionary( + final char[] dictionary ) { + final Map reverse = new HashMap(); + for (int i = 0; i < dictionary.length; i++) { + reverse.put( + dictionary[i], + i); } return reverse; } - - public String encode(BigInteger value){ - StringBuilder s = new StringBuilder(); - BigInteger[] parts = { value, BigInteger.ZERO }; + + public String encode( + final BigInteger value ) { + final StringBuilder s = new StringBuilder(); + BigInteger[] parts = { + value, + BigInteger.ZERO + }; while (parts[0].compareTo(_base) >= 0) { parts = parts[0].divideAndRemainder(_base); s.append(_dictionary[parts[1].intValue()]); @@ -49,15 +63,43 @@ public String encode(BigInteger value){ s.append(_dictionary[parts[0].intValue()]); return s.reverse().toString(); } - - public BigInteger decode(String encoded) { + + public BigInteger decodeInt( + final String encoded ) { BigInteger result = BigInteger.ZERO; - int len = encoded.length(); + final int len = encoded.length(); for (int i = 0; i < len; i++) { - BigInteger digit = BigInteger.valueOf(_reverseDictionary.get(encoded.charAt(i))); + final BigInteger digit = BigInteger.valueOf(_reverseDictionary.get(encoded.charAt(i))); result = result.add(digit.multiply(_base.pow((len - i) - 1))); } - return result; + return new BigInteger( + result.toByteArray()); + } + + @Override + public String encode( + final byte[] binary ) { + return encode(new BigInteger( + ArrayUtils.addAll( + new byte[] { + 1 + }, + binary))); } + @Override + public byte[] decode( + final String str ) { + final byte[] paddedArray = 
decodeInt( + str).toByteArray(); + return ArrayUtils.subarray( + paddedArray, + 1, + paddedArray.length); + } + + @Override + public String getEncodingName() { + return "Bennight Base " + _base; + } } diff --git a/src/main/java/com/fecher/Base128Encoding.java b/src/main/java/com/fecher/Base128Encoding.java new file mode 100644 index 0000000..2d7a834 --- /dev/null +++ b/src/main/java/com/fecher/Base128Encoding.java @@ -0,0 +1,156 @@ +package com.fecher; + +import java.io.ByteArrayOutputStream; + +/** + * Modified version of Joachim Henke's original code from + * http://base91.sourceforge.net/ + * + * basE91 encoding/decoding routines + * + * Copyright (c) 2000-2006 Joachim Henke All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * - Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. - Redistributions in binary + * form must reproduce the above copyright notice, this list of conditions and + * the following disclaimer in the documentation and/or other materials provided + * with the distribution. - Neither the name of Joachim Henke nor the names of + * his contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * @author Joachim Henke (Original version) + * @author Benedikt Waldvogel (Modifications) + * @author Rich Fecher (More Modifications) + */ +public class Base128Encoding implements + BinaryEncoding +{ + + public static final byte[] ENCODING_TABLE; + private static final byte[] DECODING_TABLE; + private static final int BASE; + private static final double AVERAGE_ENCODING_RATIO = 8.0 / 7; // floating-point on purpose: 8 / 7 is integer division and evaluates to 1 + + static { + final StringBuffer buf = new StringBuffer(); + for (int i = 0; i < 128; i++) { + buf.append((char) i); + } + ENCODING_TABLE = buf.toString().getBytes(); + BASE = ENCODING_TABLE.length; + + DECODING_TABLE = new byte[256]; + for (int i = 0; i < 256; ++i) { + DECODING_TABLE[i] = -1; + } + + for (int i = 0; i < BASE; ++i) { + DECODING_TABLE[ENCODING_TABLE[i]] = (byte) i; + } + } + + @Override + public String encode( + final byte[] data ) { + return new String( + encodeBytes(data)); + } + + @Override + public String getEncodingName() { + return "Base 128"; + } + + public static byte[] encodeBytes( + final byte[] data ) { + + final int estimatedSize = (int) Math.ceil(data.length * AVERAGE_ENCODING_RATIO); + final ByteArrayOutputStream output = new ByteArrayOutputStream( + estimatedSize); + + int ebq = 0; + int en = 0; + for (int i = 0; i < data.length; ++i) { + ebq |= (data[i] & 255) << en; + en += 8; + if (en > 13) { + int ev = ebq & 8191; + + ev = ebq & 16383; + ebq >>= 14; + en -= 14; + output.write(ENCODING_TABLE[ev % BASE]); + 
output.write(ENCODING_TABLE[ev / BASE]); + } + } + + if (en > 0) { + output.write(ENCODING_TABLE[ebq % BASE]); + if ((en > 7)) { + output.write(ENCODING_TABLE[ebq / BASE]); + } + } + + return output.toByteArray(); + } + + @Override + public byte[] decode( + final String str ) { + return decodeBytes(str.getBytes()); + } + + public static byte[] decodeBytes( + final byte[] data ) { + + int dbq = 0; + int dn = 0; + int dv = -1; + + final int estimatedSize = (int) Math.round(data.length / AVERAGE_ENCODING_RATIO); + final ByteArrayOutputStream output = new ByteArrayOutputStream( + estimatedSize); + + for (int i = 0; i < data.length; ++i) { + if (DECODING_TABLE[data[i] & 0xFF] == -1) { // mask: bytes are signed, the table index must be 0..255 + continue; + } + if (dv == -1) { + dv = DECODING_TABLE[data[i] & 0xFF]; + } + else { + dv += DECODING_TABLE[data[i] & 0xFF] * BASE; + dbq |= dv << dn; + dn += 14; + do { + output.write((byte) dbq); + dbq >>= 8; + dn -= 8; + } + while (dn > 7); + dv = -1; + } + } + + if (dv != -1) { + output.write((byte) (dbq | (dv << dn))); + } + + return output.toByteArray(); + } +} diff --git a/src/main/java/com/fecher/Base91.java b/src/main/java/com/fecher/Base91.java new file mode 100644 index 0000000..1de7edf --- /dev/null +++ b/src/main/java/com/fecher/Base91.java @@ -0,0 +1,161 @@ +package com.fecher; + +import java.io.ByteArrayOutputStream; + +/** + * Modified version of Joachim Henke's original code from + * http://base91.sourceforge.net/ + * + * basE91 encoding/decoding routines + * + * Copyright (c) 2000-2006 Joachim Henke All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * - Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. 
- Redistributions in binary + * form must reproduce the above copyright notice, this list of conditions and + * the following disclaimer in the documentation and/or other materials provided + * with the distribution. - Neither the name of Joachim Henke nor the names of + * his contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * @author Joachim Henke (Original version) + * @author Benedikt Waldvogel (Modifications) + */ +public class Base91 implements + BinaryEncoding +{ + + public static final byte[] ENCODING_TABLE; + private static final byte[] DECODING_TABLE; + private static final int BASE; + private static final float AVERAGE_ENCODING_RATIO = 1.2297f; + + static { + final String ts = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789!#$%&()*+,./:;<=>?@[]^_`{|}~\""; + ENCODING_TABLE = ts.getBytes(); + BASE = ENCODING_TABLE.length; + + DECODING_TABLE = new byte[256]; + for (int i = 0; i < 256; ++i) { + DECODING_TABLE[i] = -1; + } + + for (int i = 0; i < BASE; ++i) { + DECODING_TABLE[ENCODING_TABLE[i]] = (byte) i; + } + } + + @Override + public String encode( + final byte[] data ) { + return new String( + encodeBytes(data)); + } + + @Override + public String getEncodingName() { + return "Base 91"; + } + + @Override + public byte[] decode( + final String str ) { + return decodeBytes(str.getBytes()); + } + + public static byte[] encodeBytes( + final byte[] data ) { + + final int estimatedSize = (int) Math.ceil(data.length * AVERAGE_ENCODING_RATIO); + final ByteArrayOutputStream output = new ByteArrayOutputStream( + estimatedSize); + + int ebq = 0; + int en = 0; + for (int i = 0; i < data.length; ++i) { + ebq |= (data[i] & 255) << en; + en += 8; + if (en > 13) { + int ev = ebq & 8191; + + if (ev > 88) { + ebq >>= 13; + en -= 13; + } + else { + ev = ebq & 16383; + ebq >>= 14; + en -= 14; + } + output.write(ENCODING_TABLE[ev % BASE]); + output.write(ENCODING_TABLE[ev / BASE]); + } + } + + if (en > 0) { + output.write(ENCODING_TABLE[ebq % BASE]); + if ((en > 7) || (ebq > 90)) { + output.write(ENCODING_TABLE[ebq / BASE]); + } + } + + return output.toByteArray(); + } + + public static byte[] decodeBytes( + final byte[] data ) { + + // if (data.length == 0) + // return new byte[] {}; + + int dbq = 0; + int dn = 0; + int dv = -1; + + final int estimatedSize = 
Math.round(data.length / AVERAGE_ENCODING_RATIO); + final ByteArrayOutputStream output = new ByteArrayOutputStream( + estimatedSize); + + for (int i = 0; i < data.length; ++i) { + if (DECODING_TABLE[data[i] & 0xFF] == -1) { // mask: bytes are signed, the table index must be 0..255 + continue; + } + if (dv == -1) { + dv = DECODING_TABLE[data[i] & 0xFF]; + } + else { + dv += DECODING_TABLE[data[i] & 0xFF] * BASE; + dbq |= dv << dn; + dn += (dv & 8191) > 88 ? 13 : 14; + do { + output.write((byte) dbq); + dbq >>= 8; + dn -= 8; + } + while (dn > 7); + dv = -1; + } + } + + if (dv != -1) { + output.write((byte) (dbq | (dv << dn))); + } + + return output.toByteArray(); + } +} \ No newline at end of file diff --git a/src/main/java/com/fecher/BinaryEncoding.java b/src/main/java/com/fecher/BinaryEncoding.java new file mode 100644 index 0000000..c616326 --- /dev/null +++ b/src/main/java/com/fecher/BinaryEncoding.java @@ -0,0 +1,12 @@ +package com.fecher; + +public interface BinaryEncoding +{ + public String encode( + byte[] binary ); + + public byte[] decode( + String str ); + + public String getEncodingName(); +} diff --git a/src/main/java/com/fecher/GuavaBaseNEncoding.java b/src/main/java/com/fecher/GuavaBaseNEncoding.java new file mode 100644 index 0000000..e525717 --- /dev/null +++ b/src/main/java/com/fecher/GuavaBaseNEncoding.java @@ -0,0 +1,44 @@ +package com.fecher; + +import com.google.common.io.BaseEncoding; + +public class GuavaBaseNEncoding implements +BinaryEncoding +{ + private BaseEncoding guavaEncoding; + private final int n; + + public GuavaBaseNEncoding( + final int n ) { + switch (n) { + case 16: + guavaEncoding = BaseEncoding.base16(); + break; + case 32: + guavaEncoding = BaseEncoding.base32(); + break; + case 64: + guavaEncoding = BaseEncoding.base64(); + break; + } + this.n = n; + } + + @Override + public String encode( + final byte[] binary ) { + return guavaEncoding.encode(binary); + } + + @Override + public byte[] decode( + final String str ) { + return guavaEncoding.decode(str); + } + + @Override + public String 
getEncodingName() { + return "Guava Base " + n; + } + +} diff --git a/src/main/java/com/fecher/HBase64Encoding.java b/src/main/java/com/fecher/HBase64Encoding.java new file mode 100644 index 0000000..d58d77b --- /dev/null +++ b/src/main/java/com/fecher/HBase64Encoding.java @@ -0,0 +1,32 @@ +package com.fecher; + +import org.apache.hadoop.hbase.util.Base64; + +public class HBase64Encoding implements +BinaryEncoding +{ + + public HBase64Encoding() {} + + @Override + public String encode( + final byte[] binary ) { + return Base64.encodeBytes( + binary, + Base64.ORDERED); + } + + @Override + public byte[] decode( + final String str ) { + return Base64.decode( + str, + Base64.ORDERED); + } + + @Override + public String getEncodingName() { + return "HBase Base 64"; + } + +} diff --git a/src/test/java/com/fecher/EncodingTest.java b/src/test/java/com/fecher/EncodingTest.java new file mode 100644 index 0000000..8091044 --- /dev/null +++ b/src/test/java/com/fecher/EncodingTest.java @@ -0,0 +1,194 @@ +package com.fecher; + +import java.nio.charset.Charset; +import java.util.Arrays; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; +import java.util.Map.Entry; + +import org.apache.commons.lang.ArrayUtils; +import org.apache.commons.lang.time.StopWatch; +import org.junit.Test; +import org.paukov.combinatorics.Factory; +import org.paukov.combinatorics.ICombinatoricsVector; + +import com.bennight.BaseNEncoder; + +public class EncodingTest +{ + private final static Charset UTF_8 = Charset.forName("UTF-8"); + + private static Stats[] TEST_ENCODING = new Stats[] { + new Stats( + new BaseNEncoder( + 2)), + new Stats( + new BaseNEncoder( + 4)), + new Stats( + new BaseNEncoder( + 8)), + new Stats( + new BaseNEncoder( + 16)), + new Stats( + new BaseNEncoder( + 32)), + new Stats( + new BaseNEncoder( + 64)), + new Stats( + new BaseNEncoder( + 85)), + new Stats( + new BaseNEncoder( + 128)), + new Stats( + new GuavaBaseNEncoding( + 16)), + new Stats( + new 
GuavaBaseNEncoding( + 32)), + new Stats( + new GuavaBaseNEncoding( + 64)), + new Stats( + new HBase64Encoding()), + new Stats( + new Base91()), + new Stats( + new Base128Encoding()), + }; + + private static class Stats + { + private final Map totalBytes = new HashMap(); + // private long totalEncodings = 0; + private final Map totalEncodings = new HashMap(); + private final BinaryEncoding encoding; + private long totalSuccesses = 0; + private final StopWatch encodeTimer = new StopWatch(); + private final StopWatch decodeTimer = new StopWatch(); + + public Stats( + final BinaryEncoding encoding ) { + this.encoding = encoding; + encodeTimer.start(); + encodeTimer.suspend(); + + decodeTimer.start(); + decodeTimer.suspend(); + } + + @Override + public String toString() { + final StringBuilder str = new StringBuilder(); + str.append( + "Encoding: ").append( + encoding.getEncodingName()).append( + '\n'); + long total = 0; + for (final Entry e : totalEncodings.entrySet()) { + str.append( + // "Total Encodings (").append( + // e.getKey()).append( + // " original bytes): ").append( + // e.getValue()).append( + // '\n').append( + // "Total Encoded Bytes (").append( + // e.getKey()).append( + // " original bytes): ").append( + // totalBytes.get(e.getKey())).append( + // '\n').append( + "Average Encoded Bytes (").append( + e.getKey()).append( + " original bytes): ").append( + (double) totalBytes.get(e.getKey()) / (double) e.getValue()).append( + '\n'); + total += e.getValue(); + } + str.append( + "Total Successes: ").append( + totalSuccesses).append( + '\n').append( + "Success Rate: ").append( + (double) totalSuccesses / (double) total).append( + '\n').append( + "Time in encode: ").append( + encodeTimer.toString()).append( + '\n').append( + "Time in decode: ").append( + decodeTimer.toString()).append( + '\n').append( + "***************************\n\n"); + return str.toString(); + } + } + + public static void main( + final String[] args ) { + new EncodingTest().test(); + } + + 
@Test + public void test() { + + for (int k = 2; k <= 8; k++) { + for (final Stats stats : TEST_ENCODING) { + stats.totalBytes.put( + k, + 0L); + stats.totalEncodings.put( + k, + 0L); + } + final int divisor = (int) Math.pow( + 2, + k - 1); + final Byte[] bytesArray = new Byte[((Byte.MAX_VALUE - Byte.MIN_VALUE) + 1) / divisor]; + int i = 0; + for (int b = Byte.MIN_VALUE; b <= Byte.MAX_VALUE; b += divisor) { + bytesArray[i++] = (byte) b; + } + + final ICombinatoricsVector bytesVector = Factory.createVector(bytesArray); + final Iterator> it = Factory.createPermutationWithRepetitionGenerator( + bytesVector, + k).iterator(); + + while (it.hasNext()) { + final ICombinatoricsVector bytes = it.next(); + final byte[] bytesPrimitive = ArrayUtils.toPrimitive(bytes.getVector().toArray( + new Byte[] {})); + for (final Stats stats : TEST_ENCODING) { + stats.encodeTimer.resume(); + final String str = stats.encoding.encode(bytesPrimitive); + stats.encodeTimer.suspend(); + final byte[] utf8Encoding = str.getBytes(UTF_8); + final String utfChars = new String( + utf8Encoding, + Charset.forName("UTF-8")); + + stats.decodeTimer.resume(); + final byte[] original = stats.encoding.decode(utfChars); + stats.decodeTimer.suspend(); + if (Arrays.equals( + bytesPrimitive, + original)) { + stats.totalSuccesses++; + } + stats.totalBytes.put( + k, + stats.totalBytes.get(k) + utf8Encoding.length); + stats.totalEncodings.put( + k, + stats.totalEncodings.get(k) + 1); + } + } + } + for (final Stats stats : TEST_ENCODING) { + System.out.println(stats.toString()); + } + } +}