Skip to content

Commit 6f88f0d

Browse files
committed
Added support for decoding of high and low surrogates in UTF-16 (#5)
1 parent 624f2b5 commit 6f88f0d

File tree

1 file changed

+51
-25
lines changed

1 file changed

+51
-25
lines changed

src/dab/constants/charsets.cpp

Lines changed: 51 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
#include <stddef.h>
55
#include <stdint.h>
66
#include <uchar.h>
7+
#include <optional>
78
#include "utility/span.h"
89
#include <fmt/format.h>
910
#include "../dab_logging.h"
@@ -64,45 +65,70 @@ static std::string convert_utf16_to_utf8(tcb::span<const uint8_t> utf16_string)
6465
// High surrogates U+D800 - U+DB7F
6566
// High private use U+DB80 - U+DBFF
6667
// Low surrogates U+DC00 - U+DFFF
67-
// https://en.wikipedia.org/wiki/Universal_Character_Set_characters#Surrogates
68-
// A pair of high and low surrogates addresses U+10000-U+10FFFF according to the equation
69-
// C = 0x10000 + (H-0xD800)*0x0400 + (L-0xDC00)
7068

7169
size_t total_utf16_bytes = utf16_string.size();
7270
if (total_utf16_bytes % 2 != 0) total_utf16_bytes--; // round to 16bits
71+
7372
// represent utf16 2byte codepoints with utf8 continuation bytes
74-
size_t total_utf8_bytes = 0;
75-
for (size_t i = 0; i < total_utf16_bytes; i+=2) {
76-
const uint16_t c = uint16_t(utf16_string[i]) << 8 | uint16_t(utf16_string[i+1]); // big endian
77-
if (c <= 0x007F) total_utf8_bytes += 1;
78-
else if (c <= 0x07FF) total_utf8_bytes += 2;
79-
else if (c >= 0x2FE0 && c <= 0x2FEF) continue; // ignore gap in BMP
80-
else if (c >= 0xD800 && c <= 0xDFFF) continue; // TODO: handle surrogates
81-
else total_utf8_bytes += 3;
82-
}
83-
std::string utf8_string(total_utf8_bytes, '\0');
84-
size_t j = 0;
73+
std::string utf8_string;
74+
utf8_string.reserve(total_utf16_bytes);
75+
76+
std::optional<uint16_t> high_surrogate = std::nullopt;
8577
for (size_t i = 0; i < total_utf16_bytes; i+=2) {
8678
const uint16_t c = uint16_t(utf16_string[i]) << 8 | uint16_t(utf16_string[i+1]); // big endian
79+
80+
// https://en.wikipedia.org/wiki/Universal_Character_Set_characters#Surrogates
81+
// A pair of high and low surrogates addresses U+10000-U+10FFFF according to the equation
82+
// C = 0x10000 + (H-0xD800)*0x0400 + (L-0xDC00)
83+
if (high_surrogate != std::nullopt) {
84+
if (c >= 0xDC00 && c <= 0xDFFF) {
85+
const uint32_t H = uint32_t(high_surrogate.value());
86+
const uint32_t L = uint32_t(c);
87+
const uint32_t C = 0x10000 + (H-0xD800)*0x0400 + (L-0xDC00);
88+
// 1111_0xxx, 10xx_xxxx, 10xx_xxxx, 10xx_xxxx
89+
utf8_string.push_back(0b1111'0000 | uint8_t((C & 0b0001'1100'0000'0000'0000'0000) >> 18));
90+
utf8_string.push_back(0b1000'0000 | uint8_t((C & 0b0000'0011'1111'0000'0000'0000) >> 12));
91+
utf8_string.push_back(0b1000'0000 | uint8_t((C & 0b0000'0000'0000'1111'1100'0000) >> 6));
92+
utf8_string.push_back(0b1000'0000 | uint8_t((C & 0b0000'0000'0000'0000'0011'1111) >> 0));
93+
high_surrogate = std::nullopt;
94+
continue;
95+
} else if (c >= 0xD800 && c <= 0xDBFF) {
96+
LOG_ERROR(
97+
"high surrogate received twice in a row, first={:02x}, second={:02x}",
98+
high_surrogate.value(), c
99+
);
100+
// override original first high surrogate assuming the previous one was a fluke
101+
high_surrogate = c;
102+
continue;
103+
} else {
104+
LOG_ERROR(
105+
"surrogate pair missing low surrogate, high_surrogate={:02x}, bad_low_surrogate={:02x}",
106+
high_surrogate.value(), c
107+
);
108+
// isolated surrogates should be ignored and codepoint processed as normal
109+
high_surrogate = std::nullopt;
110+
// @fallthrough
111+
}
112+
}
113+
87114
if (c <= 0x007F) {
88115
// 0xxx_xxxx
89-
utf8_string[j] = uint8_t(c & 0x007F);
90-
j += 1;
116+
utf8_string.push_back(uint8_t(c & 0x007F));
91117
} else if (c <= 0x07FF) {
92118
// 110x_xxxx, 10xx_xxxx
93-
utf8_string[j] = 0b1100'0000 | uint8_t((c & 0b0000'0111'1100'0000) >> 6);
94-
utf8_string[j+1] = 0b1000'0000 | uint8_t((c & 0b0000'0000'0011'1111) >> 0);
95-
j += 2;
119+
utf8_string.push_back(0b1100'0000 | uint8_t((c & 0b0000'0111'1100'0000) >> 6));
120+
utf8_string.push_back(0b1000'0000 | uint8_t((c & 0b0000'0000'0011'1111) >> 0));
96121
} else if (c >= 0x2FE0 && c <= 0x2FEF) {
97122
// ignore gap in BMP
98-
} else if (c >= 0xD800 && c <= 0xDFFF) {
99-
// TODO: handle surrogates
123+
} else if (c >= 0xD800 && c <= 0xDBFF) {
124+
high_surrogate = c;
125+
} else if (c >= 0xDC00 && c <= 0xDFFF) {
126+
LOG_ERROR("got low surrogate first instead of high surrogate {:02x}", c);
100127
} else {
101128
// 1110_xxxx, 10xx_xxxx, 10xx_xxxx
102-
utf8_string[j] = 0b1110'0000 | uint8_t((c & 0b1111'0000'0000'0000) >> 12);
103-
utf8_string[j+1] = 0b1000'0000 | uint8_t((c & 0b0000'1111'1100'0000) >> 6);
104-
utf8_string[j+2] = 0b1000'0000 | uint8_t((c & 0b0000'0000'0011'1111) >> 0);
105-
j += 3;
129+
utf8_string.push_back(0b1110'0000 | uint8_t((c & 0b1111'0000'0000'0000) >> 12));
130+
utf8_string.push_back(0b1000'0000 | uint8_t((c & 0b0000'1111'1100'0000) >> 6));
131+
utf8_string.push_back(0b1000'0000 | uint8_t((c & 0b0000'0000'0011'1111) >> 0));
106132
}
107133
}
108134
return utf8_string;

0 commit comments

Comments
 (0)