|
| 1 | +/************************************************************\ |
| 2 | + * Copyright 2023 Lawrence Livermore National Security, LLC |
| 3 | + * (c.f. AUTHORS, NOTICE.LLNS, COPYING) |
| 4 | + * |
| 5 | + * This file is part of the Flux resource manager framework. |
| 6 | + * For details, see https://github.com/flux-framework. |
| 7 | + * |
| 8 | + * SPDX-License-Identifier: LGPL-3.0 |
| 9 | +\************************************************************/ |
| 10 | + |
| 11 | +/* basemoji.c - an emoji encoding for unsigned 64 bit integers |
| 12 | + */ |
| 13 | + |
| 14 | +#if HAVE_CONFIG_H |
| 15 | +#include "config.h" |
| 16 | +#endif |
| 17 | + |
| 18 | +#include <stdio.h> |
| 19 | +#include <stdlib.h> |
| 20 | +#include <stdint.h> |
| 21 | +#include <string.h> |
| 22 | +#include <errno.h> |
| 23 | +#include <stdbool.h> |
| 24 | + |
| 25 | +#include "ccan/array_size/array_size.h" |
| 26 | +#include "basemoji.h" |
| 27 | + |
/* Minimum length in bytes of a basemoji string (not counting the
 * terminating NUL): 1 emoji, or 4 bytes.
 */
#define BASEMOJI_MINLEN 4

/* Maximum length in bytes of a basemoji string (not counting the
 * terminating NUL).
 *
 * Maximum number of emoji "digits" in a basemoji string is
 *
 * ceil (ln (2^64-1)/ln (576)) = 7
 *
 * 4 bytes per emoji, so 4*7 = 28 bytes.
 */
#define BASEMOJI_MAXLEN 28
| 38 | + |
/* Basemoji digit table: the emoji stored at index i is the base-576
 * "digit" with value i. The encoder indexes this table with the
 * remainder of a division by 576, and the decoder maps an emoji back to
 * its index, so the order of entries is part of the encoding and must
 * not change.
 *
 * The following is a selection of 576 emoji in CLDR[1] collation order[2]
 * taken from the version 2010 Unicode emoji set[3]. Note: Selected code
 * points are all represented in 4 bytes, which is assumed in the
 * implementation in this module. Additionally, every character in this
 * selected set has a common first two bytes of F0 9F in UTF-8 encoding,
 * which aids in detection of a valid basemoji string.
 *
 * 1. https://cldr.unicode.org
 * 2. https://unicode.org/emoji/charts-12.1/emoji-ordering.txt
 * 3. https://unicode.org/emoji/charts/emoji-versions.html
 *
 */
const char *emojis[] = {
"😃", "😄", "😁", "😆", "😅", "😂", "😉", "😊", "😍", "😘", "😚", "😋",
"😜", "😝", "😏", "😒", "😌", "😔", "😪", "😷", "😵", "😲", "😳", "😨",
"😰", "😥", "😢", "😭", "😱", "😖", "😣", "😞", "😓", "😩", "😫", "😤",
"😡", "😠", "👿", "💀", "💩", "👹", "👺", "👻", "👽", "👾", "😺", "😸",
"😹", "😻", "😼", "😽", "🙀", "😿", "😾", "🙈", "🙉", "🙊", "💌", "💘",
"💝", "💖", "💗", "💓", "💞", "💕", "💟", "💔", "💛", "💚", "💙", "💜",
"💋", "💯", "💢", "💥", "💫", "💦", "💨", "💬", "💤", "👋", "👌", "👈",
"👉", "👆", "👇", "👍", "👎", "👊", "👏", "🙌", "👐", "🙏", "💅", "💪",
"👂", "👃", "👀", "👅", "👄", "👶", "👦", "👧", "👱", "👨", "👩", "👴",
"👵", "🙍", "🙎", "🙅", "🙆", "💁", "🙋", "🙇", "👮", "💂", "👷", "👸",
"👳", "👲", "👰", "👼", "🎅", "💆", "💇", "🚶", "🏃", "💃", "👯", "🏂",
"🏄", "🏊", "🛀", "👫", "💏", "💑", "👪", "👤", "👣", "🐵", "🐒", "🐶",
"🐩", "🐺", "🐱", "🐯", "🐴", "🐎", "🐮", "🐷", "🐗", "🐽", "🐑", "🐫",
"🐘", "🐭", "🐹", "🐰", "🐻", "🐨", "🐼", "🐾", "🐔", "🐣", "🐤", "🐥",
"🐦", "🐧", "🐸", "🐢", "🐍", "🐲", "🐳", "🐬", "🐟", "🐠", "🐡", "🐙",
"🐚", "🐌", "🐛", "🐜", "🐝", "🐞", "💐", "🌸", "💮", "🌹", "🌺", "🌻",
"🌼", "🌷", "🌱", "🌴", "🌵", "🌾", "🌿", "🍀", "🍁", "🍂", "🍃", "🍄",
"🍇", "🍈", "🍉", "🍊", "🍌", "🍍", "🍎", "🍏", "🍑", "🍒", "🍓", "🍅",
"🍆", "🌽", "🌰", "🍞", "🍖", "🍗", "🍔", "🍟", "🍕", "🍳", "🍲", "🍱",
"🍘", "🍙", "🍚", "🍛", "🍜", "🍝", "🍠", "🍢", "🍣", "🍤", "🍥", "🍡",
"🍦", "🍧", "🍨", "🍩", "🍪", "🎂", "🍰", "🍫", "🍬", "🍭", "🍮", "🍯",
"🍵", "🍶", "🍷", "🍸", "🍹", "🍺", "🍻", "🍴", "🔪", "🌏", "🗾", "🌋",
"🗻", "🏠", "🏡", "🏢", "🏣", "🏥", "🏦", "🏨", "🏩", "🏪", "🏫", "🏬",
"🏭", "🏯", "🏰", "💒", "🗼", "🗽", "🌁", "🌃", "🌄", "🌅", "🌆", "🌇",
"🌉", "🎠", "🎡", "🎢", "💈", "🎪", "🚃", "🚄", "🚅", "🚇", "🚉", "🚌",
"🚑", "🚒", "🚓", "🚕", "🚗", "🚙", "🚚", "🚲", "🚏", "🚨", "🚥", "🚧",
"🚤", "🚢", "💺", "🚀", "🕛", "🕐", "🕑", "🕒", "🕓", "🕔", "🕕", "🕖",
"🕗", "🕘", "🕙", "🕚", "🌑", "🌓", "🌔", "🌕", "🌙", "🌛", "🌟", "🌠",
"🌌", "🌀", "🌈", "🌂", "🔥", "💧", "🌊", "🎃", "🎄", "🎆", "🎇", "🎈",
"🎉", "🎊", "🎋", "🎍", "🎎", "🎏", "🎐", "🎑", "🎀", "🎁", "🎫", "🏆",
"🏀", "🏈", "🎾", "🎳", "🎣", "🎽", "🎿", "🎯", "🔫", "🎱", "🔮", "🎮",
"🎰", "🎲", "🃏", "🀄", "🎴", "🎭", "🎨", "👓", "👔", "👕", "👖", "👗",
"👘", "👙", "👚", "👛", "👜", "👝", "🎒", "👞", "👟", "👠", "👡", "👢",
"👑", "👒", "🎩", "🎓", "💄", "💍", "💎", "🔊", "📢", "📣", "🔔", "🎼",
"🎵", "🎶", "🎤", "🎧", "📻", "🎷", "🎸", "🎹", "🎺", "🎻", "📱", "📲",
"📞", "📟", "📠", "🔋", "🔌", "💻", "💽", "💾", "💿", "📀", "🎥", "🎬",
"📺", "📷", "📹", "📼", "🔍", "🔎", "💡", "🔦", "🏮", "📔", "📕", "📖",
"📗", "📘", "📙", "📚", "📓", "📒", "📃", "📜", "📄", "📰", "📑", "🔖",
"💰", "💴", "💵", "💸", "💳", "💹", "📧", "📨", "📩", "📤", "📥", "📦",
"📫", "📪", "📮", "📝", "💼", "📁", "📂", "📅", "📆", "📇", "📈", "📉",
"📊", "📋", "📌", "📍", "📎", "📏", "📐", "🔒", "🔓", "🔏", "🔐", "🔑",
"🔨", "💣", "🔧", "🔩", "🔗", "📡", "💉", "💊", "🚪", "🚽", "🚬", "🗿",
"🏧", "🚹", "🚺", "🚻", "🚼", "🚾", "🚫", "🚭", "🔞", "🔃", "🔙", "🔚",
"🔛", "🔜", "🔝", "🔯", "🔼", "🔽", "🎦", "📶", "📳", "📴", "💱", "💲",
"🔱", "📛", "🔰", "🔟", "🔠", "🔡", "🔢", "🔣", "🔤", "🆎", "🆑", "🆒",
"🆓", "🆔", "🆕", "🆖", "🆗", "🆘", "🆙", "🆚", "🈁", "🈶", "🈯", "🉐",
"🈹", "🈚", "🈲", "🉑", "🈸", "🈴", "🈳", "🈺", "🈵", "🔴", "🔵", "🔶",
"🔷", "🔸", "🔹", "🔺", "🔻", "💠", "🔘", "🔳", "🔲", "🏁", "🚩", "🎌",
};
| 101 | + |
| 102 | +bool is_basemoji_string (const char *s) |
| 103 | +{ |
| 104 | + int len = strlen (s); |
| 105 | + |
| 106 | + /* This code assumes length of emoji array is 576 |
| 107 | + * Generate error at build time if this becomes untrue: |
| 108 | + */ |
| 109 | + BUILD_ASSERT(ARRAY_SIZE(emojis) == 576); |
| 110 | + |
| 111 | + /* Check for expected length of a basemoji string, and if the |
| 112 | + * first two bytes match the expected UTF-8 encoding. |
| 113 | + * This doesn't guarantee that `s` is a valid basemoji string, |
| 114 | + * but this will catch most obvious cases and other invalid strings |
| 115 | + * are left to be detected in decode. |
| 116 | + */ |
| 117 | + if (len >= BASEMOJI_MINLEN |
| 118 | + && len <= BASEMOJI_MAXLEN |
| 119 | + && len % 4 == 0 |
| 120 | + && (uint8_t)s[0] == 0xf0 |
| 121 | + && (uint8_t)s[1] == 0x9f) |
| 122 | + return true; |
| 123 | + return false; |
| 124 | +} |
| 125 | + |
| 126 | +/* Encode id into buf in reverse (i.e. higher order bytes are encoded |
| 127 | + * and placed first into 'buf' since we're doing progressive division.) |
| 128 | + */ |
| 129 | +static int emoji_revenc (char *buf, int buflen, uint64_t id) |
| 130 | +{ |
| 131 | + int index = 0; |
| 132 | + memset (buf, 0, buflen); |
| 133 | + if (id == 0) { |
| 134 | + memcpy (buf, emojis[0], 4); |
| 135 | + return 4; |
| 136 | + } |
| 137 | + while (id > 0) { |
| 138 | + int rem = id % 576; |
| 139 | + memcpy (buf+index, emojis[rem], 4); |
| 140 | + index += 4; |
| 141 | + id = id / 576; |
| 142 | + } |
| 143 | + return index; |
| 144 | +} |
| 145 | + |
| 146 | +int uint64_basemoji_encode (uint64_t id, char *buf, int buflen) |
| 147 | +{ |
| 148 | + int count; |
| 149 | + int n; |
| 150 | + char reverse[BASEMOJI_MAXLEN+1]; |
| 151 | + |
| 152 | + if (buf == NULL || buflen <= 0) { |
| 153 | + errno = EINVAL; |
| 154 | + return -1; |
| 155 | + } |
| 156 | + |
| 157 | + /* Encode bytes to emoji (in reverse), which also gives us a count |
| 158 | + * of the total bytes required for this encoding. |
| 159 | + */ |
| 160 | + if ((count = emoji_revenc (reverse, sizeof (reverse), id)) < 0) { |
| 161 | + errno = EINVAL; |
| 162 | + return -1; |
| 163 | + } |
| 164 | + |
| 165 | + /* Check for overflow of provided buffer: |
| 166 | + * Need space for count bytes for emoji + NUL |
| 167 | + */ |
| 168 | + if (count + 1 > buflen) { |
| 169 | + errno = EOVERFLOW; |
| 170 | + return -1; |
| 171 | + } |
| 172 | + |
| 173 | + memset (buf, 0, buflen); |
| 174 | + n = 0; |
| 175 | + |
| 176 | + /* Copy 4-byte emojis back in order so that most significant bits are |
| 177 | + * on the left: |
| 178 | + */ |
| 179 | + for (int i = count - 4; i >= 0; i-=4) { |
| 180 | + memcpy (buf+n, reverse+i, 4); |
| 181 | + n+=4; |
| 182 | + } |
| 183 | + return 0; |
| 184 | +} |
| 185 | + |
| 186 | + |
| 187 | +static int basemoji_lookup (const char *c, int *result) |
| 188 | +{ |
| 189 | + for (int i = 0; i < 576; i++) { |
| 190 | + if (memcmp (c, emojis[i], 4) == 0) { |
| 191 | + *result = i; |
| 192 | + return 0; |
| 193 | + } |
| 194 | + } |
| 195 | + errno = EINVAL; |
| 196 | + return -1; |
| 197 | +} |
| 198 | + |
/* Decode the basemoji string 'str' into an unsigned 64 bit integer.
 *
 * On success the decoded value is stored in '*idp' and 0 is returned.
 * On failure '*idp' is not modified, errno is set to EINVAL and -1 is
 * returned. Failure cases are:
 *  - 'str' or 'idp' is NULL
 *  - 'str' fails the is_basemoji_string() check
 *  - 'str' contains a 4-byte sequence that is not in the emoji table
 *  - 'str' encodes a value that does not fit in 64 bits (a 7 digit
 *    string can represent values up to 576^7 - 1, which exceeds 2^64 - 1)
 */
int uint64_basemoji_decode (const char *str, uint64_t *idp)
{
    uint64_t id = 0;
    size_t len;

    if (str == NULL
        || idp == NULL
        || !is_basemoji_string (str)) {
        errno = EINVAL;
        return -1;
    }

    /* Most significant digit is on the left, so accumulate left to right:
     * id = id * 576 + digit. The check above guarantees the length is a
     * multiple of 4, so each step consumes exactly one 4-byte emoji.
     */
    len = strlen (str);
    for (size_t i = 0; i < len; i += 4) {
        int digit;

        if (basemoji_lookup (str + i, &digit) < 0) {
            errno = EINVAL;
            return -1;
        }
        /* Reject the digit if id * 576 + digit would exceed UINT64_MAX,
         * rather than silently wrapping to an unrelated id.
         */
        if (id > (UINT64_MAX - (uint64_t)digit) / 576) {
            errno = EINVAL;
            return -1;
        }
        id = id * 576 + (uint64_t)digit;
    }
    *idp = id;
    return 0;
}
0 commit comments