Skip to content

Commit c42d5ca

Browse files
committed
Support alignment when a line contains only UTF-8 Basic Latin and Latin-1 Supplement characters
1 parent 22c1310 commit c42d5ca

File tree

9 files changed

+84
-19
lines changed

9 files changed

+84
-19
lines changed

CodeFormatCore/src/Format/Analyzer/AlignAnalyzer.cpp

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -298,7 +298,7 @@ void AlignAnalyzer::AnalyzeArrayTableAlign(FormatState &f, std::vector<LuaSyntax
298298
std::size_t maxAlign = 0;
299299
auto &file = t.GetFile();
300300
for (auto &table: arrayTable) {
301-
if (file.CheckCurrentLineUnicodeBefore(table.GetTextRange(t).GetEndOffset())) {
301+
if (file.CheckNonUniformCharBefore(table.GetTextRange(t).GetEndOffset())) {
302302
return;
303303
}
304304

@@ -320,9 +320,9 @@ void AlignAnalyzer::AnalyzeArrayTableAlign(FormatState &f, std::vector<LuaSyntax
320320
for (std::size_t i = 0; i < maxAlign; i++) {
321321
for (auto &tableFieldArray: arrayTableFieldVec) {
322322
if (i < tableFieldArray.size()) {
323-
auto text = tableFieldArray[i].GetText(t);
324-
if (elementLength < text.size()) {
325-
elementLength = text.size();
323+
auto textLength = tableFieldArray[i].GetUtf8Length(t);
324+
if (elementLength < textLength) {
325+
elementLength = textLength;
326326
}
327327
group.push_back(tableFieldArray[i].GetFirstToken(t).GetIndex());
328328
}
@@ -364,7 +364,7 @@ void AlignAnalyzer::ResolveAlignGroup(FormatState &f, std::size_t groupIndex, Al
364364
if (diff > 2) {
365365
allowAlign = true;
366366
}
367-
if (file.CheckCurrentLineUnicodeBefore(eq.GetTextRange(t).StartOffset)) {
367+
if (file.CheckNonUniformCharBefore(eq.GetTextRange(t).StartOffset)) {
368368
return;
369369
}
370370
}
@@ -377,7 +377,7 @@ void AlignAnalyzer::ResolveAlignGroup(FormatState &f, std::size_t groupIndex, Al
377377
if (eq.IsToken(t)) {
378378
auto prev = eq.GetPrevToken(t);
379379
auto prevSpace = f.GetStyle().space_around_assign_operator == SpaceAroundStyle::Always ? 2 : 1;
380-
auto newPos = prev.GetTextRange(t).GetEndOffset() + prevSpace - node.GetTextRange(t).StartOffset;
380+
auto newPos = prev.GetEndCol(t) + prevSpace - node.GetStartCol(t);
381381
if (newPos > maxDis) {
382382
maxDis = newPos;
383383
}
@@ -396,7 +396,7 @@ void AlignAnalyzer::ResolveAlignGroup(FormatState &f, std::size_t groupIndex, Al
396396
auto eq = node.GetChildToken('=', t);
397397
if (eq.IsToken(t)) {
398398
auto prev = eq.GetPrevToken(t);
399-
if (file.CheckCurrentLineUnicodeBefore(eq.GetTextRange(t).StartOffset)) {
399+
if (file.CheckNonUniformCharBefore(eq.GetTextRange(t).StartOffset)) {
400400
return;
401401
}
402402
auto prevSpace = f.GetStyle().space_around_assign_operator == SpaceAroundStyle::Always ? 2 : 1;
@@ -434,7 +434,7 @@ void AlignAnalyzer::ResolveAlignGroup(FormatState &f, std::size_t groupIndex, Al
434434
for (auto i: group.SyntaxGroup) {
435435
auto comment = LuaSyntaxNode(i);
436436
if (comment.IsToken(t)) {
437-
if (file.CheckCurrentLineUnicodeBefore(comment.GetTextRange(t).StartOffset)) {
437+
if (file.CheckNonUniformCharBefore(comment.GetTextRange(t).StartOffset)) {
438438
return;
439439
}
440440

@@ -637,9 +637,9 @@ void AlignAnalyzer::AnalyzeSimilarCallAlign(FormatState &f, std::vector<LuaSynta
637637
for (std::size_t i = 0; i < maxAlign; i++) {
638638
for (auto &args: argsVec) {
639639
if (i < args.size()) {
640-
auto text = args[i].GetText(t);
641-
if (elementLength < text.size()) {
642-
elementLength = text.size();
640+
auto textLength = args[i].GetUtf8Length(t);
641+
if (elementLength < textLength) {
642+
elementLength = textLength;
643643
}
644644
group.push_back(args[i].GetFirstToken(t).GetIndex());
645645
}

CodeFormatCore/src/Format/FormatBuilder.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
#include "CodeFormatCore/Format/Analyzer/IndentationAnalyzer.h"
44
#include "LuaParser/Lexer/LuaTokenTypeDetail.h"
55
#include "Util/StringUtil.h"
6+
#include "Util/Utf8.h"
67

78

89
FormatBuilder::FormatBuilder(LuaStyle &style) {
@@ -34,7 +35,7 @@ void FormatBuilder::WriteSyntaxNode(LuaSyntaxNode &syntaxNode, const LuaSyntaxTr
3435
break;
3536
}
3637
default: {
37-
_state.CurrentWidth() += text.size();
38+
_state.CurrentWidth() += syntaxNode.GetUtf8Length(t);
3839
_formattedText.append(text);
3940
}
4041
}
@@ -365,7 +366,8 @@ void FormatBuilder::WriteText(std::string_view text) {
365366
}
366367

367368
if (text.size() > last) {
368-
_state.CurrentWidth() += text.size() - last;
369+
std::size_t lastLineRest = text.size() - last;
370+
_state.CurrentWidth() += utf8::Utf8nLen(text.data() + last, lastLineRest);
369371
if (last != 0) {
370372
_formattedText.append(text.substr(last));
371373
} else {

LuaParser/include/LuaParser/Ast/LuaSyntaxNode.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@ class LuaSyntaxNode {
2727

2828
std::string_view GetText(const LuaSyntaxTree &t) const;
2929

30+
std::size_t GetUtf8Length(const LuaSyntaxTree &t) const;
31+
3032
bool IsNode(const LuaSyntaxTree &t) const;
3133

3234
bool IsToken(const LuaSyntaxTree &t) const;

LuaParser/include/LuaParser/File/LuaSource.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ class LuaSource {
1717

1818
std::size_t GetColumn(std::size_t offset) const;
1919

20-
bool CheckCurrentLineUnicodeBefore(std::size_t offset) const;
20+
bool CheckNonUniformCharBefore(std::size_t offset) const;
2121

2222
std::size_t GetLineOffset(std::size_t offset) const;
2323

LuaParser/src/Ast/LuaSyntaxNode.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,11 @@ std::string_view LuaSyntaxNode::GetText(const LuaSyntaxTree &t) const {
4242
return t.GetFile().Slice(GetTextRange(t));
4343
}
4444

45+
std::size_t LuaSyntaxNode::GetUtf8Length(const LuaSyntaxTree &t) const {
46+
auto text = GetText(t);
47+
return utf8::Utf8nLen(text.data(), text.size());
48+
}
49+
4550
bool LuaSyntaxNode::IsNode(const LuaSyntaxTree &t) const {
4651
return t.IsNode(_index);
4752
}
@@ -290,4 +295,3 @@ std::size_t LuaSyntaxNode::CountNodeChild(LuaSyntaxNodeKind kind, const LuaSynta
290295
bool LuaSyntaxNode::IsEmpty(const LuaSyntaxTree &t) const {
291296
return t.GetFirstChild(_index) == 0;
292297
}
293-

LuaParser/src/File/LuaSource.cpp

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -59,14 +59,17 @@ std::size_t LuaSource::GetColumn(std::size_t offset) const {
5959
return 0;
6060
}
6161

62-
bool LuaSource::CheckCurrentLineUnicodeBefore(std::size_t offset) const {
63-
auto line = GetLine(offset);
6462

63+
bool LuaSource::CheckNonUniformCharBefore(std::size_t offset) const {
64+
auto line = GetLine(offset);
6565
auto lineStartOffset = _lineOffsetVec[line];
6666

6767
if (offset > lineStartOffset) {
68-
for (std::size_t i = lineStartOffset; i < offset; i++) {
69-
if (_source[i] & 0x80) {
68+
std::size_t byteNum = 0;
69+
for (std::size_t i = lineStartOffset; i < offset; i += byteNum) {
70+
auto codepoint = utf8::Utf8ToUnicode(_source.data() + i, _source.size() - i, byteNum);
71+
// support UTF-8 Basic Latin and Latin-1 Supplement
72+
if (codepoint > 0xff) {
7073
return true;
7174
}
7275
}

Test/src/FormatResult_unitest.cpp

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1099,4 +1099,30 @@ local t <const> = 1
10991099
local t=1
11001100
local t <const> =1
11011101
)", style));
1102+
}
1103+
1104+
TEST(Format, feature_146_support_UTF_8_Basic_Latin_and_Latin_1_Supplement) {
1105+
LuaStyle style;
1106+
1107+
EXPECT_TRUE(TestHelper::TestFormatted(
1108+
R"(
1109+
t["¡¢£¤¥¦§¨©ª«®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖרÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ"] = 1
1110+
b = 123
1111+
1112+
1113+
only_latin = {
1114+
{ 'aaaa', nil },
1115+
{ 'e', nil },
1116+
}
1117+
)",
1118+
R"(
1119+
t["¡¢£¤¥¦§¨©ª«®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖרÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ"] = 1
1120+
b = 123
1121+
1122+
1123+
only_latin = {
1124+
{ 'aaaa', nil },
1125+
{ 'e', nil },
1126+
}
1127+
)", style));
11021128
}

Util/include/Util/Utf8.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,4 +14,6 @@ std::size_t Utf8nByteNum(const char *source, std::size_t maxByteNum, std::size_t
1414
std::size_t Utf8nLenAtFirstLine(const char *source, std::size_t byteNum);
1515

1616
std::size_t Utf8OneCharLen(const char *source);
17+
18+
uint32_t Utf8ToUnicode(const char *source, std::size_t maxNum, std::size_t &byteNum);
1719
}// namespace utf8

Util/src/Utf8.cpp

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,3 +117,29 @@ std::size_t utf8::Utf8OneCharLen(const char *source) {
117117
return 1;
118118
}
119119
}
120+
121+
namespace utf8 {

/// Decodes the UTF-8 sequence that starts at `source` into a Unicode code point.
///
/// Well-formed input decodes exactly as before. Malformed input - a stray
/// continuation byte, an invalid lead byte, a sequence truncated by `maxNum`,
/// a bad continuation byte, an overlong form, a surrogate, or a value above
/// U+10FFFF - yields U+FFFD (REPLACEMENT CHARACTER) and consumes a single byte,
/// so a scanning caller resynchronises on the next byte. Bytes are read as
/// `unsigned char`, so the result no longer depends on whether plain `char`
/// is signed on the target platform.
///
/// @param source   Pointer to the first byte to decode.
/// @param maxNum   Number of readable bytes starting at `source`.
/// @param byteNum  Out: number of bytes consumed; always at least 1, so a loop
///                 that advances by it cannot stall.
/// @return         The decoded code point, U+FFFD for malformed input, or 0
///                 when there is nothing to read.
uint32_t Utf8ToUnicode(const char *source, std::size_t maxNum, std::size_t &byteNum) {
    constexpr uint32_t replacementChar = 0xFFFD;
    constexpr uint32_t maxCodepoint = 0x10FFFF;

    // Default to one byte so every early return still lets the caller advance.
    byteNum = 1;
    if (source == nullptr || maxNum == 0) {
        return 0;
    }

    // Go through unsigned char: plain char may be signed, and a sign-extended
    // byte would turn into a huge bogus code point.
    const auto byteAt = [source](std::size_t index) -> uint32_t {
        return static_cast<unsigned char>(source[index]);
    };

    const uint32_t lead = byteAt(0);
    if (lead < 0x80) {
        // 1-byte ASCII (0b0xxxxxxx)
        return lead;
    }

    std::size_t length = 0;
    uint32_t codepoint = 0;
    uint32_t minimum = 0;// smallest value that legitimately needs `length` bytes
    if ((lead & 0xe0) == 0xc0) {
        // 2-byte sequence (0b110xxxxx)
        length = 2;
        codepoint = lead & 0x1f;
        minimum = 0x80;
    } else if ((lead & 0xf0) == 0xe0) {
        // 3-byte sequence (0b1110xxxx)
        length = 3;
        codepoint = lead & 0x0f;
        minimum = 0x800;
    } else if ((lead & 0xf8) == 0xf0) {
        // 4-byte sequence (0b11110xxx)
        length = 4;
        codepoint = lead & 0x07;
        minimum = 0x10000;
    } else {
        // Stray continuation byte (0b10xxxxxx) or an invalid lead (0b11111xxx).
        return replacementChar;
    }

    if (maxNum < length) {
        // Sequence is cut off by the end of the buffer.
        return replacementChar;
    }

    for (std::size_t i = 1; i < length; i++) {
        const uint32_t continuation = byteAt(i);
        if ((continuation & 0xc0) != 0x80) {
            return replacementChar;
        }
        codepoint = (codepoint << 6) | (continuation & 0x3f);
    }

    const bool overlong = codepoint < minimum;
    const bool surrogate = codepoint >= 0xD800 && codepoint <= 0xDFFF;
    if (overlong || surrogate || codepoint > maxCodepoint) {
        return replacementChar;
    }

    byteNum = length;
    return codepoint;
}

}// namespace utf8

0 commit comments

Comments
 (0)