organicmaps/libs/drape/harfbuzz_shaping.cpp at 6ed03fb4dc057b751de97a7a2131720372e188e7 · Osyotr/organicmaps · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
#include "drape/harfbuzz_shaping.hpp"

#include "base/assert.hpp"
#include "base/logging.hpp"
#include "base/string_utils.hpp"

#include <algorithm>
#include <array>
#include <sstream>
#include <string>

#include <unicode/ubidi.h>    // ubidi_open, ubidi_setPara
#include <unicode/uscript.h>  // UScriptCode
#include <utf8/unchecked.h>

namespace harfbuzz_shaping
{
namespace
{
// Fixed-size buffer that receives the script codes of a single codepoint; it is the output buffer handed to
// uscript_getScriptExtensions. Some Unicode characters may be a part of up to 32 different scripts.
using TScriptsArray = std::array<UScriptCode, 32>;

// Writes the script and the script extensions of the Unicode codepoint.
// Returns the number of written scripts.
int32_t GetScriptExtensions(char32_t codepoint, TScriptsArray & scripts)
{
  // Fill scripts with the script extensions.
  UErrorCode icu_error = U_ZERO_ERROR;
  int32_t const count = uscript_getScriptExtensions(static_cast<UChar32>(codepoint), scripts.data(),
                                                   static_cast<int32_t>(scripts.max_size()), &icu_error);
  if (U_FAILURE(icu_error))
  {
    LOG(LWARNING, ("uscript_getScriptExtensions failed with error", icu_error));
    return 0;
  }

  return count;
}

// Intersects the script extensions set of codepoint with scripts and returns the updated size of the scripts.
// The output result will be a subset of the input result (thus resultSize can only be smaller).
int32_t ScriptSetIntersect(char32_t codepoint, TScriptsArray & inOutScripts, int32_t inOutScriptsCount)
{
  // Each codepoint has a Script property and a Script Extensions (Scx) property.
  //
  // The implicit Script property values 'Common' and 'Inherited' indicate that a codepoint is widely used in many
  // scripts, rather than being associated to a specific script.
  //
  // However, some codepoints that are assigned a value of 'Common' or 'Inherited' are not commonly used with all
  // scripts, but rather only with a limited set of scripts. The Script Extension property is used to specify the set
  // of script which borrow the codepoint.
  //
  // Calls to GetScriptExtensions(...) return the set of scripts where the codepoints can be used.
  // (see table 7 from http://www.unicode.org/reports/tr24/tr24-29.html)
  //
  //     Script       Script Extensions ->  Results
  //  1) Common       {Common}          ->  {Common}
  //     Inherited    {Inherited}       ->  {Inherited}
  //  2) Latin        {Latn}            ->  {Latn}
  //     Inherited    {Latn}            ->  {Latn}
  //  3) Common       {Hira Kana}       ->  {Hira Kana}
  //     Inherited    {Hira Kana}       ->  {Hira Kana}
  //  4) Devanagari   {Deva Dogr Kthi Mahj}  ->  {Deva Dogr Kthi Mahj}
  //     Myanmar      {Cakm Mymr Tale}  ->  {Cakm Mymr Tale}
  //
  // For most of the codepoints, the script extensions set contains only one element. For CJK codepoints, it's common
  // to see 3-4 scripts. For really rare cases, the set can go above 20 scripts.
  TScriptsArray codepointScripts;
  size_t const codepointScriptsCount = static_cast<size_t>(GetScriptExtensions(codepoint, codepointScripts));

  // Implicit script 'inherited' is inheriting scripts from preceding codepoint.
  if (codepointScriptsCount == 1 && codepointScripts[0] == USCRIPT_INHERITED)
    return inOutScriptsCount;

  auto const contains = [&codepointScripts, codepointScriptsCount](UScriptCode code)
  {
    for (size_t i = 0; i < codepointScriptsCount; ++i)
      if (codepointScripts[i] == code)
        return true;

    return false;
  };

  // Intersect both script sets.
  ASSERT(!contains(USCRIPT_INHERITED), ());
  size_t outSize = 0;
  for (size_t i = 0; i < inOutScriptsCount; ++i)
  {
    auto const currentScript = inOutScripts[i];
    if (contains(currentScript))
      inOutScripts[outSize++] = currentScript;
  }

  return outSize;
}

// Find the longest sequence of characters from 0 and up to length that have at least one common UScriptCode value.
// Writes the common script value to script and returns the length of the sequence. Takes the characters' script
// extensions into account. http://www.unicode.org/reports/tr24/#ScriptX
//
// Consider 3 characters with the script values {Kana}, {Hira, Kana}, {Kana}. Without script extensions only the first
// script in each set would be taken into account, resulting in 3 segments where 1 would be enough.
int32_t ScriptInterval(std::u16string const & text, int32_t start, int32_t length, UScriptCode & outScript)
{
  ASSERT_GREATER(length, 0U, ());

  auto const begin = text.begin() + start;
  auto const end = text.begin() + start + length;
  auto iterator = begin;

  auto c32 = utf8::unchecked::next16(iterator);

  TScriptsArray scripts;
  int32_t scriptsSize = GetScriptExtensions(c32, scripts);

  while (iterator != end)
  {
    auto prev = iterator;
    c32 = utf8::unchecked::next16(iterator);
    scriptsSize = ScriptSetIntersect(c32, scripts, scriptsSize);
    if (scriptsSize == 0)
    {
      length = static_cast<int32_t>(prev - begin);
      break;
    }
  }

  outScript = scripts[0];
  return length;
}

// Translates an ICU script code into the HarfBuzz script with the same short name.
// This is a local copy of hb_icu_script_to_script, kept here to avoid a direct ICU dependency.
// USCRIPT_INVALID_CODE is mapped to HB_SCRIPT_INVALID without consulting ICU.
hb_script_t ICUScriptToHarfbuzzScript(UScriptCode script)
{
  return script == USCRIPT_INVALID_CODE ? HB_SCRIPT_INVALID
                                        : hb_script_from_string(uscript_getShortName(script), -1);
}

// Splits segments.m_text into runs and appends one segment per run to segments.m_segments, in increasing order of
// the run start offsets. The text is first divided into bidi logical runs and each logical run is then divided into
// runs of characters that share a script. If ubidi_setPara fails, the error is logged and a single zero-length
// segment with HB_SCRIPT_UNKNOWN and HB_DIRECTION_INVALID is appended instead.
void GetSingleTextLineRuns(TextSegments & segments)
{
  auto const & text = segments.m_text;
  auto const textLength = static_cast<int32_t>(text.length());

  // Deliberately not checking for nullptr.
  thread_local UBiDi * const bidi = ubidi_open();
  UErrorCode error = U_ZERO_ERROR;
  ::ubidi_setPara(bidi, text.data(), textLength, UBIDI_DEFAULT_LTR, nullptr, &error);
  if (U_FAILURE(error))
  {
    LOG(LERROR, ("ubidi_setPara failed with code", error));
    segments.m_segments.emplace_back(0, 0, HB_SCRIPT_UNKNOWN, HB_DIRECTION_INVALID);
    return;
  }

  // The text is split by logical runs, and each logical run is split by common script. This invariant holds:
  // bidiRunStart <= scriptRunStart <= scriptRunEnd <= bidiRunEnd.
  // AB: Breaking runs (splits at special characters and style boundaries) are dropped now, they may not be needed.
  int32_t bidiRunStart = 0;
  while (bidiRunStart < textLength)
  {
    // Determine the longest logical run (e.g. same bidi direction) from this point.
    int32_t bidiRunEnd = 0;
    UBiDiLevel bidiLevel = 0;
    ::ubidi_getLogicalRun(bidi, bidiRunStart, &bidiRunEnd, &bidiLevel);
    ASSERT_LESS(bidiRunStart, bidiRunEnd, ());

    // The lowest bit of the level selects the direction; it is the same for every script run of this logical run.
    // TODO(AB): Support vertical layouts if necessary.
    auto const direction = (bidiLevel & 0x01) != 0 ? HB_DIRECTION_RTL : HB_DIRECTION_LTR;

    int32_t scriptRunStart = bidiRunStart;
    while (scriptRunStart < bidiRunEnd)
    {
      // Find the longest sequence of characters that have at least one common UScriptCode value.
      UScriptCode script = USCRIPT_INVALID_CODE;
      int32_t const scriptRunLength = ScriptInterval(text, scriptRunStart, bidiRunEnd - scriptRunStart, script);
      int32_t const scriptRunEnd = scriptRunStart + scriptRunLength;
      ASSERT_LESS(scriptRunStart, base::asserted_cast<int32_t>(scriptRunEnd), ());

      // TODO(AB): May need to break on different unicode blocks, parentheses, and control chars (spaces).

      segments.m_segments.emplace_back(scriptRunStart, scriptRunLength, ICUScriptToHarfbuzzScript(script), direction);

      // Continue with the next script sequence.
      scriptRunStart = scriptRunEnd;
    }

    // Continue with the next direction sequence.
    bidiRunStart = bidiRunEnd;
  }
}

// Reorders segments.m_segments in place according to segment directions.
// The direction of the first segment is taken as the line direction. Every maximal group of consecutive segments
// whose direction differs from the line direction is reversed; afterwards, if the line direction is not
// HB_DIRECTION_LTR, the whole sequence is reversed as well. An empty segment list is left untouched.
void ReorderRTL(TextSegments & segments)
{
  // TODO(AB): Optimize implementation to use indexes to segments instead of copying them.
  auto it = segments.m_segments.begin();
  auto const end = segments.m_segments.end();
  // Nothing to reorder. This also guards the dereference of the first segment below, which would be undefined
  // behavior for an empty container.
  if (it == end)
    return;

  // TODO(AB): Line (default rendering) direction is determined by the first segment. It should be defined as
  // a parameter depending on the language.
  auto const lineDirection = it->m_direction;
  while (it != end)
  {
    if (it->m_direction == lineDirection)
    {
      ++it;
    }
    else
    {
      // Collect the whole group of segments running against the line direction and flip their order.
      auto const start = it++;
      while (it != end && it->m_direction != lineDirection)
        ++it;
      std::reverse(start, it);
    }
  }
  if (lineDirection != HB_DIRECTION_LTR)
    std::reverse(segments.m_segments.begin(), end);
}
}  // namespace

// Converts |utf8| to UTF-16, splits the text into runs with GetSingleTextLineRuns and reorders the resulting
// segments with ReorderRTL. Empty input and input containing '\r' or '\n' are rejected by assertions.
TextSegments GetTextSegments(std::string_view utf8)
{
  ASSERT(!utf8.empty(), ("Shaping of empty strings is not supported"));
  ASSERT(std::string::npos == utf8.find_first_of("\r\n"), ("Shaping with line breaks is not supported", utf8));

  // The returned object owns the UTF-16 copy of the text; its segment list starts out empty.
  // TODO(AB): Can unnecessary conversion/allocation be avoided?
  TextSegments result{strings::ToUtf16(utf8), {}};

  // TODO(AB): Runs are not split by breaking chars and by different fonts.
  GetSingleTextLineRuns(result);
  ReorderRTL(result);

  return result;
}

// Formats the start, length, script and direction of |segment| into a single-line string.
std::string DebugPrint(TextSegment const & segment)
{
  std::ostringstream out;
  out << "TextSegment[start=" << segment.m_start;
  out << ", length=" << segment.m_length;
  out << ", script=" << segment.m_script;
  out << ", direction=" << segment.m_direction;
  out << ']';
  return out.str();
}

}  // namespace harfbuzz_shaping