Skip to content

Commit 222d181

Browse files
committed
This patch enables the -fexec-charset option to control the execution charset of string literals. It sets the default internal charset, system charset, and execution charset for z/OS, and uses UTF-8 as the default on all other platforms.
(cherry picked from commit 0295d0d) (cherry picked from commit e379f6cb9d063cb78c6b48b0e0a8d9f241958f89)
1 parent 6c4d121 commit 222d181

File tree

20 files changed

+375
-48
lines changed

20 files changed

+375
-48
lines changed

clang/docs/LanguageExtensions.rst

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -420,8 +420,7 @@ Builtin Macros
420420
``__clang_literal_encoding__``
421421
Defined to a narrow string literal that represents the current encoding of
422422
narrow string literals, e.g., ``"hello"``. This macro typically expands to
423-
"UTF-8" (but may change in the future if the
424-
``-fexec-charset="Encoding-Name"`` option is implemented.)
423+
the name of the text encoding given by ``-fexec-charset``, or of the system charset when that option is not specified.
425424

426425
``__clang_wide_literal_encoding__``
427426
Defined to a narrow string literal that represents the current encoding of

clang/include/clang/Basic/LangOptions.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -565,6 +565,9 @@ class LangOptions : public LangOptionsBase {
565565
bool AtomicFineGrainedMemory = false;
566566
bool AtomicIgnoreDenormalMode = false;
567567

568+
/// Name of the exec charset to convert the internal charset to.
569+
std::string ExecCharset;
570+
568571
LangOptions();
569572

570573
/// Set language defaults for the given input language and

clang/include/clang/Basic/TokenKinds.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,13 @@ inline bool isLiteral(TokenKind K) {
111111
return isInLiteralRange;
112112
}
113113

114+
/// Return true if \p K is one of the UTF-8, UTF-16, or UTF-32 character
/// constant or string literal token kinds (tok::utf8_*, tok::utf16_*,
/// tok::utf32_*). Every other token kind yields false.
115+
inline bool isUTFLiteral(TokenKind K) {
116+
return K == tok::utf8_char_constant || K == tok::utf8_string_literal ||
117+
K == tok::utf16_char_constant || K == tok::utf16_string_literal ||
118+
K == tok::utf32_char_constant || K == tok::utf32_string_literal;
119+
}
120+
114121
/// Return true if this is any of tok::annot_* kinds.
115122
bool isAnnotation(TokenKind K);
116123

clang/include/clang/Driver/Options.td

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7467,6 +7467,11 @@ let Visibility = [CC1Option, CC1AsOption, FC1Option] in {
74677467
def tune_cpu : Separate<["-"], "tune-cpu">,
74687468
HelpText<"Tune for a specific cpu type">,
74697469
MarshallingInfoString<TargetOpts<"TuneCPU">>;
7470+
// -fexec-charset <charset>: takes the charset name as a separate argument and
// marshals it into LangOptions::ExecCharset.
def fexec_charset : Separate<["-"], "fexec-charset">, MetaVarName<"<charset>">,
7471+
HelpText<"Set the execution <charset> for string and character literals. "
7472+
"Supported character encodings include ISO8859-1, UTF-8, IBM-1047 "
7473+
"and those supported by the host icu or iconv library.">,
7474+
MarshallingInfoString<LangOpts<"ExecCharset">>;
74707475
def target_cpu : Separate<["-"], "target-cpu">,
74717476
HelpText<"Target a specific cpu type">,
74727477
MarshallingInfoString<TargetOpts<"CPU">>;
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
//===--- clang/Lex/LiteralConverter.h - Translator for Literals -*- C++ -*-===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#ifndef LLVM_CLANG_LEX_LITERALCONVERTER_H
10+
#define LLVM_CLANG_LEX_LITERALCONVERTER_H
11+
12+
#include "clang/Basic/Diagnostic.h"
13+
#include "clang/Basic/LangOptions.h"
14+
#include "clang/Basic/TargetInfo.h"
15+
#include "llvm/ADT/StringMap.h"
16+
#include "llvm/ADT/StringRef.h"
17+
#include "llvm/Support/TextEncoding.h"
18+
19+
/// Identifies which charset conversion is requested for a literal; accepted by
/// LiteralConverter::getConverter(ConversionAction).
/// NOTE(review): presumably NoConversion leaves the literal untouched, while
/// ToSystemCharset / ToExecCharset select the system or execution charset as
/// the conversion target -- confirm against the implementation.
enum ConversionAction { NoConversion, ToSystemCharset, ToExecCharset };
20+
21+
/// Holds the names of the internal, system, and execution charsets together
/// with a string-keyed map of llvm::TextEncodingConverter objects.
/// Only declarations are given here; the member functions are defined
/// elsewhere.
class LiteralConverter {
22+
// Charset names, stored as non-owning llvm::StringRef views.
// NOTE(review): the strings these refer to must outlive this object --
// confirm where they are assigned from.
llvm::StringRef InternalCharset;
23+
llvm::StringRef SystemCharset;
24+
llvm::StringRef ExecCharset;
25+
// Converters keyed by string.
// NOTE(review): presumably the key is the target charset name -- confirm.
llvm::StringMap<llvm::TextEncodingConverter> TextEncodingConverters;
26+
27+
public:
28+
/// Returns a pointer to a converter for \p Codepage.
/// NOTE(review): presumably a lookup in TextEncodingConverters that may
/// return null when no entry exists -- confirm.
llvm::TextEncodingConverter *getConverter(const char *Codepage);
29+
/// Returns a pointer to the converter selected by \p Action.
/// NOTE(review): the result for NoConversion is presumably null -- confirm.
llvm::TextEncodingConverter *getConverter(ConversionAction Action);
30+
/// Judging by its name, creates a converter targeting charset \p To and
/// stores it in TextEncodingConverters; returns a pointer to a converter.
/// NOTE(review): source charset and failure behaviour are not visible here.
llvm::TextEncodingConverter *createAndInsertCharConverter(const char *To);
31+
/// Configures this object from the language options \p Opts and target
/// \p TInfo; \p Diags is supplied for reporting problems.
/// NOTE(review): presumably reads Opts.ExecCharset -- confirm.
void setConvertersFromOptions(const clang::LangOptions &Opts,
32+
const clang::TargetInfo &TInfo,
33+
clang::DiagnosticsEngine &Diags);
34+
};
35+
36+
#endif

clang/include/clang/Lex/LiteralSupport.h

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,13 @@
1717
#include "clang/Basic/CharInfo.h"
1818
#include "clang/Basic/LLVM.h"
1919
#include "clang/Basic/TokenKinds.h"
20+
#include "clang/Lex/LiteralConverter.h"
2021
#include "llvm/ADT/APFloat.h"
2122
#include "llvm/ADT/ArrayRef.h"
2223
#include "llvm/ADT/SmallString.h"
2324
#include "llvm/ADT/StringRef.h"
2425
#include "llvm/Support/DataTypes.h"
25-
26+
#include "llvm/Support/TextEncoding.h"
2627
namespace clang {
2728

2829
class DiagnosticsEngine;
@@ -233,6 +234,7 @@ class StringLiteralParser {
233234
const LangOptions &Features;
234235
const TargetInfo &Target;
235236
DiagnosticsEngine *Diags;
237+
LiteralConverter *LiteralConv;
236238

237239
unsigned MaxTokenLength;
238240
unsigned SizeBound;
@@ -246,18 +248,19 @@ class StringLiteralParser {
246248
StringLiteralEvalMethod EvalMethod;
247249

248250
public:
249-
StringLiteralParser(ArrayRef<Token> StringToks, Preprocessor &PP,
250-
StringLiteralEvalMethod StringMethod =
251-
StringLiteralEvalMethod::Evaluated);
251+
StringLiteralParser(
252+
ArrayRef<Token> StringToks, Preprocessor &PP,
253+
StringLiteralEvalMethod StringMethod = StringLiteralEvalMethod::Evaluated,
254+
ConversionAction Action = ToExecCharset);
252255
StringLiteralParser(ArrayRef<Token> StringToks, const SourceManager &sm,
253256
const LangOptions &features, const TargetInfo &target,
254257
DiagnosticsEngine *diags = nullptr)
255258
: SM(sm), Features(features), Target(target), Diags(diags),
256-
MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown),
257-
ResultPtr(ResultBuf.data()),
259+
LiteralConv(nullptr), MaxTokenLength(0), SizeBound(0), CharByteWidth(0),
260+
Kind(tok::unknown), ResultPtr(ResultBuf.data()),
258261
EvalMethod(StringLiteralEvalMethod::Evaluated), hadError(false),
259262
Pascal(false) {
260-
init(StringToks);
263+
init(StringToks, NoConversion);
261264
}
262265

263266
bool hadError;
@@ -305,7 +308,7 @@ class StringLiteralParser {
305308
static bool isValidUDSuffix(const LangOptions &LangOpts, StringRef Suffix);
306309

307310
private:
308-
void init(ArrayRef<Token> StringToks);
311+
void init(ArrayRef<Token> StringToks, ConversionAction Action);
309312
bool CopyStringFragment(const Token &Tok, const char *TokBegin,
310313
StringRef Fragment);
311314
void DiagnoseLexingError(SourceLocation Loc);

clang/include/clang/Lex/Preprocessor.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
#include "clang/Basic/TokenKinds.h"
2626
#include "clang/Lex/HeaderSearch.h"
2727
#include "clang/Lex/Lexer.h"
28+
#include "clang/Lex/LiteralConverter.h"
2829
#include "clang/Lex/MacroInfo.h"
2930
#include "clang/Lex/ModuleLoader.h"
3031
#include "clang/Lex/ModuleMap.h"
@@ -163,6 +164,7 @@ class Preprocessor {
163164
std::unique_ptr<ScratchBuffer> ScratchBuf;
164165
HeaderSearch &HeaderInfo;
165166
ModuleLoader &TheModuleLoader;
167+
LiteralConverter LiteralConv;
166168

167169
/// External source of macros.
168170
ExternalPreprocessorSource *ExternalSource;
@@ -1235,6 +1237,7 @@ class Preprocessor {
12351237
SelectorTable &getSelectorTable() { return Selectors; }
12361238
Builtin::Context &getBuiltinInfo() { return *BuiltinInfo; }
12371239
llvm::BumpPtrAllocator &getPreprocessorAllocator() { return BP; }
1240+
LiteralConverter &getLiteralConverter() { return LiteralConv; }
12381241

12391242
void setExternalSource(ExternalPreprocessorSource *Source) {
12401243
ExternalSource = Source;

clang/lib/Driver/ToolChains/Clang.cpp

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@
4949
#include "llvm/Support/FileSystem.h"
5050
#include "llvm/Support/Path.h"
5151
#include "llvm/Support/Process.h"
52+
#include "llvm/Support/TextEncoding.h"
5253
#include "llvm/Support/YAMLParser.h"
5354
#include "llvm/TargetParser/AArch64TargetParser.h"
5455
#include "llvm/TargetParser/ARMTargetParserCommon.h"
@@ -7416,12 +7417,20 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
74167417
<< value;
74177418
}
74187419

7419-
// -fexec_charset=UTF-8 is default. Reject others
7420+
// Set the default fexec-charset as the system charset.
7421+
CmdArgs.push_back("-fexec-charset");
7422+
CmdArgs.push_back(Args.MakeArgString(Triple.getSystemCharset()));
74207423
if (Arg *execCharset = Args.getLastArg(options::OPT_fexec_charset_EQ)) {
74217424
StringRef value = execCharset->getValue();
7422-
if (!value.equals_insensitive("utf-8"))
7423-
D.Diag(diag::err_drv_invalid_value) << execCharset->getAsString(Args)
7424-
<< value;
7425+
llvm::ErrorOr<llvm::TextEncodingConverter> ErrorOrConverter =
7426+
llvm::TextEncodingConverter::create("UTF-8", value.data());
7427+
if (ErrorOrConverter) {
7428+
CmdArgs.push_back("-fexec-charset");
7429+
CmdArgs.push_back(Args.MakeArgString(value));
7430+
} else {
7431+
D.Diag(diag::err_drv_invalid_value)
7432+
<< execCharset->getAsString(Args) << value;
7433+
}
74257434
}
74267435

74277436
RenderDiagnosticsOptions(D, Args, CmdArgs);

clang/lib/Frontend/CompilerInstance.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
#include "clang/Frontend/Utils.h"
3333
#include "clang/Frontend/VerifyDiagnosticConsumer.h"
3434
#include "clang/Lex/HeaderSearch.h"
35+
#include "clang/Lex/LiteralConverter.h"
3536
#include "clang/Lex/Preprocessor.h"
3637
#include "clang/Lex/PreprocessorOptions.h"
3738
#include "clang/Sema/CodeCompleteConsumer.h"
@@ -543,6 +544,9 @@ void CompilerInstance::createPreprocessor(TranslationUnitKind TUKind) {
543544

544545
if (GetDependencyDirectives)
545546
PP->setDependencyDirectivesGetter(*GetDependencyDirectives);
547+
548+
PP->getLiteralConverter().setConvertersFromOptions(getLangOpts(), getTarget(),
549+
getDiagnostics());
546550
}
547551

548552
std::string CompilerInstance::getSpecificModuleCachePath(StringRef ModuleHash) {

clang/lib/Frontend/InitPreprocessor.cpp

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1022,10 +1022,14 @@ static void InitializePredefinedMacros(const TargetInfo &TI,
10221022
}
10231023
}
10241024

1025-
// Macros to help identify the narrow and wide character sets
1026-
// FIXME: clang currently ignores -fexec-charset=. If this changes,
1027-
// then this may need to be updated.
1028-
Builder.defineMacro("__clang_literal_encoding__", "\"UTF-8\"");
1025+
// Macros to help identify the narrow and wide character sets. This is set
1026+
// to fexec-charset. If fexec-charset is not specified, the default is the
1027+
// system charset.
// __clang_literal_encoding__ is documented as expanding to a narrow string
// literal, so the charset name is wrapped in double quotes; a bare name such
// as UTF-8 would expand to tokens that are not a string literal.
1028+
if (!LangOpts.ExecCharset.empty())
1029+
Builder.defineMacro("__clang_literal_encoding__",
"\"" + llvm::Twine(LangOpts.ExecCharset) + "\"");
1030+
else
1031+
Builder.defineMacro("__clang_literal_encoding__",
1032+
"\"" + llvm::Twine(TI.getTriple().getSystemCharset()) + "\"");
10291033
if (TI.getTypeWidth(TI.getWCharType()) >= 32) {
10301034
// FIXME: 32-bit wchar_t signals UTF-32. This may change
10311035
// if -fwide-exec-charset= is ever supported.

0 commit comments

Comments
 (0)