diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst index 495f2ab3926ce..bcb80c1d6cb76 100644 --- a/clang/docs/LanguageExtensions.rst +++ b/clang/docs/LanguageExtensions.rst @@ -419,9 +419,9 @@ Builtin Macros ``__clang_literal_encoding__`` Defined to a narrow string literal that represents the current encoding of - narrow string literals, e.g., ``"hello"``. This macro typically expands to - "UTF-8" (but may change in the future if the - ``-fexec-charset="Encoding-Name"`` option is implemented.) + narrow string literals, e.g., ``"hello"``. This macro expands to the text + encoding specified by ``-fexec-charset`` if any, or a system-specific default + otherwise: ``"IBM-1047"`` on z/OS and ``"UTF-8"`` on all other systems. ``__clang_wide_literal_encoding__`` Defined to a narrow string literal that represents the current encoding of diff --git a/clang/include/clang/Basic/DiagnosticFrontendKinds.td b/clang/include/clang/Basic/DiagnosticFrontendKinds.td index 9e344160ff934..2c939be5b82e9 100644 --- a/clang/include/clang/Basic/DiagnosticFrontendKinds.td +++ b/clang/include/clang/Basic/DiagnosticFrontendKinds.td @@ -342,7 +342,8 @@ def err_non_default_visibility_dllimport : Error< "non-default visibility cannot be applied to 'dllimport' declaration">; def err_ifunc_resolver_return : Error< "ifunc resolver function must return a pointer">; - +def err_fe_literal_conv_config : Error< + "failed to configure the literal converter">; def warn_atomic_op_misaligned : Warning< "misaligned atomic operation may incur " "significant performance penalty" diff --git a/clang/include/clang/Basic/DiagnosticLexKinds.td b/clang/include/clang/Basic/DiagnosticLexKinds.td index c7fe6e1db6d1f..11fa70cbe667e 100644 --- a/clang/include/clang/Basic/DiagnosticLexKinds.td +++ b/clang/include/clang/Basic/DiagnosticLexKinds.td @@ -279,6 +279,8 @@ def ext_string_too_long : Extension<"string literal of length %0 exceeds " "support">, InGroup; def err_character_too_large : 
Error< "character too large for enclosing character literal type">; +def err_exec_charset_conversion_failed : Error< + "conversion to execution encoding failed: '%0'">; def warn_c99_compat_unicode_literal : Warning< "unicode literals are incompatible with C99">, InGroup, DefaultIgnore; diff --git a/clang/include/clang/Basic/LangOptions.h b/clang/include/clang/Basic/LangOptions.h index 8aa89d8c8c807..b3bb398643ba2 100644 --- a/clang/include/clang/Basic/LangOptions.h +++ b/clang/include/clang/Basic/LangOptions.h @@ -573,6 +573,9 @@ class LangOptions : public LangOptionsBase { /// The allocation token mode. std::optional AllocTokenMode; + /// Name of the execution encoding to convert the internal encoding to. + std::string ExecEncoding; + LangOptions(); /// Set language defaults for the given input language and diff --git a/clang/include/clang/Basic/TokenKinds.h b/clang/include/clang/Basic/TokenKinds.h index d84f3598cbf33..27c3ecf7f4ae2 100644 --- a/clang/include/clang/Basic/TokenKinds.h +++ b/clang/include/clang/Basic/TokenKinds.h @@ -111,6 +111,18 @@ inline bool isLiteral(TokenKind K) { return isInLiteralRange; } +/// Return true if this is a UTF literal kind. +inline bool isUTFLiteral(TokenKind K) { + return K == tok::utf8_char_constant || K == tok::utf8_string_literal || + K == tok::utf16_char_constant || K == tok::utf16_string_literal || + K == tok::utf32_char_constant || K == tok::utf32_string_literal; +} + +/// Return true if this is a wide literal kind. +inline bool isWideLiteral(TokenKind K) { + return K == tok::wide_char_constant || K == tok::wide_string_literal; +} + /// Return true if this is any of tok::annot_* kinds. 
bool isAnnotation(TokenKind K); diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 8784c9d7d206d..f16b734101c77 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -2967,7 +2967,10 @@ def fexperimental_strict_floating_point : Flag<["-"], "fexperimental-strict-floa def finput_charset_EQ : Joined<["-"], "finput-charset=">, Visibility<[ClangOption, FlangOption, FC1Option]>, Group, HelpText<"Specify the default character set for source files">; -def fexec_charset_EQ : Joined<["-"], "fexec-charset=">, Group; +def fexec_charset_EQ : Joined<["-"], "fexec-charset=">, Group, + HelpText<"Set the execution encoding for string and character literals. " + "Supported character encodings include ISO-8859-1, UTF-8, IBM1047, " + "and possibly those supported by ICU or the host iconv library.">; def finstrument_functions : Flag<["-"], "finstrument-functions">, Group, @@ -7473,6 +7476,11 @@ let Visibility = [CC1Option, CC1AsOption, FC1Option] in { def tune_cpu : Separate<["-"], "tune-cpu">, HelpText<"Tune for a specific cpu type">, MarshallingInfoString>; +def fexec_charset : Separate<["-"], "fexec-charset">, MetaVarName<"">, + HelpText<"Set the execution encoding for string and character literals. " + "Supported character encodings include ISO-8859-1, UTF-8, IBM1047, " + "and possibly those supported by ICU or the host iconv library.">, + MarshallingInfoString>; def target_cpu : Separate<["-"], "target-cpu">, HelpText<"Target a specific cpu type">, MarshallingInfoString>; @@ -9078,7 +9086,9 @@ def _SLASH_source_charset : CLCompileJoined<"source-charset:">, HelpText<"Set source encoding, supports only UTF-8">, Alias; def _SLASH_execution_charset : CLCompileJoined<"execution-charset:">, - HelpText<"Set runtime encoding, supports only UTF-8">, + HelpText<"Set the execution encoding for string and character literals. 
" + "Supported character encodings include ISO-8859-1, UTF-8, IBM1047, " + "and possibly those supported by ICU or the host iconv library.">, Alias; def _SLASH_std : CLCompileJoined<"std:">, HelpText<"Set language version (c++14,c++17,c++20,c++23preview,c++latest,c11,c17)">; diff --git a/clang/include/clang/Lex/LiteralConverter.h b/clang/include/clang/Lex/LiteralConverter.h new file mode 100644 index 0000000000000..6a66d2d0ff707 --- /dev/null +++ b/clang/include/clang/Lex/LiteralConverter.h @@ -0,0 +1,40 @@ +//===--- clang/Lex/LiteralConverter.h - Translator for Literals -*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_LEX_LITERALCONVERTER_H +#define LLVM_CLANG_LEX_LITERALCONVERTER_H + +#include "clang/Basic/Diagnostic.h" +#include "clang/Basic/LangOptions.h" +#include "clang/Basic/TargetInfo.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/TextEncoding.h" + +enum ConversionAction { + CA_NoConversion, + CA_ToSystemEncoding, + CA_ToExecEncoding +}; + +class LiteralConverter { + llvm::StringRef InternalEncoding; + llvm::StringRef SystemEncoding; + llvm::StringRef ExecEncoding; + llvm::TextEncodingConverter *ToSystemEncodingConverter = nullptr; + llvm::TextEncodingConverter *ToExecEncodingConverter = nullptr; + +public: + llvm::TextEncodingConverter *getConverter(ConversionAction Action); + static std::error_code + setConvertersFromOptions(LiteralConverter &LiteralConv, + const clang::LangOptions &Opts, + const clang::TargetInfo &TInfo); +}; + +#endif diff --git a/clang/include/clang/Lex/LiteralSupport.h b/clang/include/clang/Lex/LiteralSupport.h index ea5f63bc20399..32ae829096592 100644 --- a/clang/include/clang/Lex/LiteralSupport.h +++ 
b/clang/include/clang/Lex/LiteralSupport.h @@ -17,12 +17,13 @@ #include "clang/Basic/CharInfo.h" #include "clang/Basic/LLVM.h" #include "clang/Basic/TokenKinds.h" +#include "clang/Lex/LiteralConverter.h" #include "llvm/ADT/APFloat.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/DataTypes.h" - +#include "llvm/Support/TextEncoding.h" namespace clang { class DiagnosticsEngine; @@ -233,6 +234,7 @@ class StringLiteralParser { const LangOptions &Features; const TargetInfo &Target; DiagnosticsEngine *Diags; + LiteralConverter *LiteralConv; unsigned MaxTokenLength; unsigned SizeBound; @@ -246,18 +248,19 @@ class StringLiteralParser { StringLiteralEvalMethod EvalMethod; public: - StringLiteralParser(ArrayRef StringToks, Preprocessor &PP, - StringLiteralEvalMethod StringMethod = - StringLiteralEvalMethod::Evaluated); + StringLiteralParser( + ArrayRef StringToks, Preprocessor &PP, + StringLiteralEvalMethod StringMethod = StringLiteralEvalMethod::Evaluated, + ConversionAction Action = CA_ToExecEncoding); StringLiteralParser(ArrayRef StringToks, const SourceManager &sm, const LangOptions &features, const TargetInfo &target, DiagnosticsEngine *diags = nullptr) : SM(sm), Features(features), Target(target), Diags(diags), - MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown), - ResultPtr(ResultBuf.data()), + LiteralConv(nullptr), MaxTokenLength(0), SizeBound(0), CharByteWidth(0), + Kind(tok::unknown), ResultPtr(ResultBuf.data()), EvalMethod(StringLiteralEvalMethod::Evaluated), hadError(false), Pascal(false) { - init(StringToks); + init(StringToks, CA_NoConversion); } bool hadError; @@ -305,7 +308,7 @@ class StringLiteralParser { static bool isValidUDSuffix(const LangOptions &LangOpts, StringRef Suffix); private: - void init(ArrayRef StringToks); + void init(ArrayRef StringToks, ConversionAction Action); bool CopyStringFragment(const Token &Tok, const char *TokBegin, StringRef Fragment); 
void DiagnoseLexingError(SourceLocation Loc); diff --git a/clang/include/clang/Lex/Preprocessor.h b/clang/include/clang/Lex/Preprocessor.h index 39754847a93e4..8dd27bc414f80 100644 --- a/clang/include/clang/Lex/Preprocessor.h +++ b/clang/include/clang/Lex/Preprocessor.h @@ -25,6 +25,7 @@ #include "clang/Basic/TokenKinds.h" #include "clang/Lex/HeaderSearch.h" #include "clang/Lex/Lexer.h" +#include "clang/Lex/LiteralConverter.h" #include "clang/Lex/MacroInfo.h" #include "clang/Lex/ModuleLoader.h" #include "clang/Lex/ModuleMap.h" @@ -163,6 +164,7 @@ class Preprocessor { std::unique_ptr ScratchBuf; HeaderSearch &HeaderInfo; ModuleLoader &TheModuleLoader; + LiteralConverter LiteralConv; /// External source of macros. ExternalPreprocessorSource *ExternalSource; @@ -1235,6 +1237,7 @@ class Preprocessor { SelectorTable &getSelectorTable() { return Selectors; } Builtin::Context &getBuiltinInfo() { return *BuiltinInfo; } llvm::BumpPtrAllocator &getPreprocessorAllocator() { return BP; } + LiteralConverter &getLiteralConverter() { return LiteralConv; } void setExternalSource(ExternalPreprocessorSource *Source) { ExternalSource = Source; diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 79edc561c551f..8fd7f7f0d12e9 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -49,6 +49,7 @@ #include "llvm/Support/FileSystem.h" #include "llvm/Support/Path.h" #include "llvm/Support/Process.h" +#include "llvm/Support/TextEncoding.h" #include "llvm/Support/YAMLParser.h" #include "llvm/TargetParser/AArch64TargetParser.h" #include "llvm/TargetParser/ARMTargetParserCommon.h" @@ -7363,12 +7364,24 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, << value; } - // -fexec_charset=UTF-8 is default. 
Reject others - if (Arg *execCharset = Args.getLastArg(options::OPT_fexec_charset_EQ)) { - StringRef value = execCharset->getValue(); - if (!value.equals_insensitive("utf-8")) - D.Diag(diag::err_drv_invalid_value) << execCharset->getAsString(Args) - << value; + if (Arg *execEncoding = Args.getLastArg(options::OPT_fexec_charset_EQ)) { + StringRef value = execEncoding->getValue(); + bool KnownEncoding = + llvm::TextEncodingConverter::getKnownEncoding(value).has_value(); + if (!KnownEncoding) { + llvm::ErrorOr ErrorOrConverter = + llvm::TextEncodingConverter::create("UTF-8", value.data()); + if (!ErrorOrConverter) + D.Diag(diag::err_drv_invalid_value) + << execEncoding->getAsString(Args) << value; + } + CmdArgs.push_back("-fexec-charset"); + CmdArgs.push_back(Args.MakeArgString(value)); + } else { + // Set the default fexec-charset as the system charset. + CmdArgs.push_back("-fexec-charset"); + CmdArgs.push_back( + Args.MakeArgString(Triple.getDefaultNarrowTextEncoding())); } RenderDiagnosticsOptions(D, Args, CmdArgs); diff --git a/clang/lib/Frontend/CompilerInstance.cpp b/clang/lib/Frontend/CompilerInstance.cpp index 6b09f7f9fc1e3..26fa83b8b7bcd 100644 --- a/clang/lib/Frontend/CompilerInstance.cpp +++ b/clang/lib/Frontend/CompilerInstance.cpp @@ -32,6 +32,7 @@ #include "clang/Frontend/Utils.h" #include "clang/Frontend/VerifyDiagnosticConsumer.h" #include "clang/Lex/HeaderSearch.h" +#include "clang/Lex/LiteralConverter.h" #include "clang/Lex/Preprocessor.h" #include "clang/Lex/PreprocessorOptions.h" #include "clang/Sema/CodeCompleteConsumer.h" @@ -541,6 +542,10 @@ void CompilerInstance::createPreprocessor(TranslationUnitKind TUKind) { if (GetDependencyDirectives) PP->setDependencyDirectivesGetter(*GetDependencyDirectives); + + if (LiteralConverter::setConvertersFromOptions(PP->getLiteralConverter(), + getLangOpts(), getTarget())) + PP->getDiagnostics().Report(clang::diag::err_fe_literal_conv_config); } std::string CompilerInstance::getSpecificModuleCachePath(StringRef 
ModuleHash) { diff --git a/clang/lib/Frontend/FrontendAction.cpp b/clang/lib/Frontend/FrontendAction.cpp index ed1169eb06d22..8ad0184a2e5a5 100644 --- a/clang/lib/Frontend/FrontendAction.cpp +++ b/clang/lib/Frontend/FrontendAction.cpp @@ -525,7 +525,9 @@ static SourceLocation ReadOriginalFileName(CompilerInstance &CI, if (T.isAtStartOfLine() || T.getKind() != tok::string_literal) return SourceLocation(); - StringLiteralParser Literal(T, CI.getPreprocessor()); + StringLiteralParser Literal(T, CI.getPreprocessor(), + StringLiteralEvalMethod::Evaluated, + CA_NoConversion); if (Literal.hadError) return SourceLocation(); RawLexer->LexFromRawLexer(T); diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp index 47f1d5a6b636c..2a1f106517e78 100644 --- a/clang/lib/Frontend/InitPreprocessor.cpp +++ b/clang/lib/Frontend/InitPreprocessor.cpp @@ -1022,10 +1022,14 @@ static void InitializePredefinedMacros(const TargetInfo &TI, } } - // Macros to help identify the narrow and wide character sets - // FIXME: clang currently ignores -fexec-charset=. If this changes, - // then this may need to be updated. - Builder.defineMacro("__clang_literal_encoding__", "\"UTF-8\""); + // Macros to help identify the narrow and wide character sets. This is set + // to fexec-charset. If fexec-charset is not specified, the default is the + // system charset. + if (!LangOpts.ExecEncoding.empty()) + Builder.defineMacro("__clang_literal_encoding__", LangOpts.ExecEncoding); + else + Builder.defineMacro("__clang_literal_encoding__", + TI.getTriple().getDefaultNarrowTextEncoding()); if (TI.getTypeWidth(TI.getWCharType()) >= 32) { // FIXME: 32-bit wchar_t signals UTF-32. This may change // if -fwide-exec-charset= is ever supported. 
diff --git a/clang/lib/Lex/CMakeLists.txt b/clang/lib/Lex/CMakeLists.txt index f61737cd68021..9e38a1b8fbb44 100644 --- a/clang/lib/Lex/CMakeLists.txt +++ b/clang/lib/Lex/CMakeLists.txt @@ -12,6 +12,7 @@ add_clang_library(clangLex InitHeaderSearch.cpp Lexer.cpp LexHLSLRootSignature.cpp + LiteralConverter.cpp LiteralSupport.cpp MacroArgs.cpp MacroInfo.cpp diff --git a/clang/lib/Lex/LiteralConverter.cpp b/clang/lib/Lex/LiteralConverter.cpp new file mode 100644 index 0000000000000..2bd177d499b87 --- /dev/null +++ b/clang/lib/Lex/LiteralConverter.cpp @@ -0,0 +1,60 @@ +//===--- LiteralConverter.cpp - Translator for String Literals -----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "clang/Lex/LiteralConverter.h" +#include "clang/Basic/DiagnosticDriver.h" + +using namespace llvm; + +llvm::TextEncodingConverter * +LiteralConverter::getConverter(ConversionAction Action) { + if (Action == CA_ToSystemEncoding) + return ToSystemEncodingConverter; + else if (Action == CA_ToExecEncoding) + return ToExecEncodingConverter; + else + return nullptr; +} + +std::error_code +LiteralConverter::setConvertersFromOptions(LiteralConverter &LiteralConv, + const clang::LangOptions &Opts, + const clang::TargetInfo &TInfo) { + using namespace llvm; + LiteralConv.InternalEncoding = "UTF-8"; + LiteralConv.SystemEncoding = TInfo.getTriple().getDefaultNarrowTextEncoding(); + LiteralConv.ExecEncoding = Opts.ExecEncoding.empty() + ? 
LiteralConv.InternalEncoding + : Opts.ExecEncoding; + + // Create converter between internal and system encoding + if (LiteralConv.InternalEncoding != LiteralConv.SystemEncoding) { + ErrorOr ErrorOrConverter = + llvm::TextEncodingConverter::create(LiteralConv.InternalEncoding, + LiteralConv.SystemEncoding); + if (ErrorOrConverter) { + LiteralConv.ToSystemEncodingConverter = + new TextEncodingConverter(std::move(*ErrorOrConverter)); + } else + return ErrorOrConverter.getError(); + } + + // Create converter between internal and exec encoding specified + // in fexec-charset option. + if (LiteralConv.InternalEncoding == LiteralConv.ExecEncoding) + return std::error_code(); + ErrorOr ErrorOrConverter = + llvm::TextEncodingConverter::create(LiteralConv.InternalEncoding, + LiteralConv.ExecEncoding); + if (ErrorOrConverter) { + LiteralConv.ToExecEncodingConverter = + new TextEncodingConverter(std::move(*ErrorOrConverter)); + } else + return ErrorOrConverter.getError(); + return std::error_code(); +} diff --git a/clang/lib/Lex/LiteralSupport.cpp b/clang/lib/Lex/LiteralSupport.cpp index 1c061528e85f1..daef1ec5d7a0f 100644 --- a/clang/lib/Lex/LiteralSupport.cpp +++ b/clang/lib/Lex/LiteralSupport.cpp @@ -134,7 +134,8 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin, FullSourceLoc Loc, unsigned CharWidth, DiagnosticsEngine *Diags, const LangOptions &Features, - StringLiteralEvalMethod EvalMethod) { + StringLiteralEvalMethod EvalMethod, + llvm::TextEncodingConverter *Converter) { const char *EscapeBegin = ThisTokBuf; bool Delimited = false; bool EndDelimiterFound = false; @@ -146,6 +147,8 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin, // that would have been \", which would not have been the end of string. unsigned ResultChar = *ThisTokBuf++; char Escape = ResultChar; + bool Transcode = true; + bool Invalid = false; switch (ResultChar) { // These map to themselves. 
case '\\': case '\'': case '"': case '?': break; @@ -186,6 +189,7 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin, ResultChar = 11; break; case 'x': { // Hex escape. + Transcode = false; ResultChar = 0; if (ThisTokBuf != ThisTokEnd && *ThisTokBuf == '{') { Delimited = true; @@ -249,6 +253,7 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin, case '4': case '5': case '6': case '7': { // Octal escapes. --ThisTokBuf; + Transcode = false; ResultChar = 0; // Octal escapes are a series of octal digits with maximum length 3. @@ -334,6 +339,7 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin, << std::string(1, ResultChar); break; default: + Invalid = true; if (!Diags) break; @@ -367,6 +373,15 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin, HadError = true; } + if (Transcode && Converter) { + // Invalid escapes are written as '?' and then translated. + char ByteChar = Invalid ? '?' : ResultChar; + SmallString<8> ResultCharConv; + Converter->convert(StringRef(&ByteChar, 1), ResultCharConv); + assert(ResultCharConv.size() == 1 && + "Char size increased after translation"); + ResultChar = ResultCharConv[0]; + } return ResultChar; } @@ -1751,6 +1766,7 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end, HadError = false; Kind = kind; + LiteralConverter LiteralConv = PP.getLiteralConverter(); const char *TokBegin = begin; @@ -1817,6 +1833,10 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end, largest_character_for_kind = 0x7Fu; } + llvm::TextEncodingConverter *Converter = nullptr; + if (!isUTFLiteral(Kind) && !isWideLiteral(Kind)) + Converter = LiteralConv.getConverter(CA_ToExecEncoding); + while (begin != end) { // Is this a span of non-escape characters? 
if (begin[0] != '\\') { @@ -1854,6 +1874,16 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end, HadError = true; PP.Diag(Loc, diag::err_character_too_large); } + if (!HadError && Converter) { + assert(Kind != tok::wide_char_constant && + "Wide character translation not supported"); + char ByteChar = *tmp_out_start; + SmallString<1> ConvertedChar; + Converter->convert(StringRef(&ByteChar, 1), ConvertedChar); + assert(ConvertedChar.size() == 1 && + "Char size increased after translation"); + *tmp_out_start = ConvertedChar[0]; + } } } @@ -1861,16 +1891,43 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end, } // Is this a Universal Character Name escape? if (begin[1] == 'u' || begin[1] == 'U' || begin[1] == 'N') { - unsigned short UcnLen = 0; - if (!ProcessUCNEscape(TokBegin, begin, end, *buffer_begin, UcnLen, - FullSourceLoc(Loc, PP.getSourceManager()), - &PP.getDiagnostics(), PP.getLangOpts(), true)) { - HadError = true; - } else if (*buffer_begin > largest_character_for_kind) { - HadError = true; - PP.Diag(Loc, diag::err_character_too_large); + if (Converter == nullptr) { + unsigned short UcnLen = 0; + if (!ProcessUCNEscape(TokBegin, begin, end, *buffer_begin, UcnLen, + FullSourceLoc(Loc, PP.getSourceManager()), + &PP.getDiagnostics(), PP.getLangOpts(), true)) { + HadError = true; + } else if (*buffer_begin > largest_character_for_kind) { + HadError = true; + PP.Diag(Loc, diag::err_character_too_large); + } + } else { + char Cp[5]; + char *ResultPtr = Cp; + EncodeUCNEscape(TokBegin, begin, end, ResultPtr, HadError, + FullSourceLoc(Loc, PP.getSourceManager()), + /*CharByteWidth=*/1u, &PP.getDiagnostics(), + PP.getLangOpts()); + assert(ResultPtr - Cp <= 4 && + "unexpected result size for UCN escape character"); + if (!HadError) { + SmallString<8> CpConv; + StringRef ToConvert(Cp, ResultPtr - Cp); // Cp is not NUL-terminated. + std::error_code EC = Converter->convert(ToConvert, CpConv); + if (EC) { + PP.Diag(Loc, 
diag::err_exec_charset_conversion_failed) + << EC.message(); + HadError = true; + } else { + if (CpConv.size() > 1) { + HadError = true; + PP.Diag(Loc, diag::err_character_too_large); + } else { + *buffer_begin = CpConv[0]; + } + } + } } - ++buffer_begin; continue; } @@ -1879,7 +1936,7 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end, ProcessCharEscape(TokBegin, begin, end, HadError, FullSourceLoc(Loc, PP.getSourceManager()), CharWidth, &PP.getDiagnostics(), PP.getLangOpts(), - StringLiteralEvalMethod::Evaluated); + StringLiteralEvalMethod::Evaluated, nullptr); *buffer_begin++ = result; } @@ -1989,16 +2046,54 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end, /// StringLiteralParser::StringLiteralParser(ArrayRef StringToks, Preprocessor &PP, - StringLiteralEvalMethod EvalMethod) + StringLiteralEvalMethod EvalMethod, + ConversionAction Action) : SM(PP.getSourceManager()), Features(PP.getLangOpts()), Target(PP.getTargetInfo()), Diags(&PP.getDiagnostics()), - MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown), - ResultPtr(ResultBuf.data()), EvalMethod(EvalMethod), hadError(false), - Pascal(false) { - init(StringToks); + LiteralConv(&PP.getLiteralConverter()), MaxTokenLength(0), SizeBound(0), + CharByteWidth(0), Kind(tok::unknown), ResultPtr(ResultBuf.data()), + EvalMethod(EvalMethod), hadError(false), Pascal(false) { + init(StringToks, Action); } -void StringLiteralParser::init(ArrayRef StringToks){ +static char *convertCharactersInPlace(char *ResultPtr, char *ResultPtrBefore, + const unsigned CharByteWidth, + bool &hadError, + llvm::TextEncodingConverter &Converter) { + assert(!hadError && "Unexpected call to convertCharactersInPlace"); + + SmallString<256> CpConv; + int ResultLength = ResultPtr - ResultPtrBefore; + assert(ResultLength % CharByteWidth == 0 && + "Unexpected span of bytes for the characters."); + char *Cp = ResultPtrBefore; + if (Converter.convert(StringRef(Cp, ResultLength / 
CharByteWidth), CpConv)) { + hadError = true; + return ResultPtr; + } + if (CharByteWidth == 1) { + memcpy(Cp, CpConv.data(), CpConv.size()); + return Cp + CpConv.size(); + } + std::string UTF8String; + if (CharByteWidth == 4) + convertUTF32ToUTF8String(ArrayRef(Cp, ResultLength), UTF8String); + else if (CharByteWidth == 2) + convertUTF16ToUTF8String(ArrayRef(Cp, ResultLength), UTF8String); + if (Converter.convert(UTF8String, CpConv)) { + hadError = true; + return ResultPtr; + } + int NewCharByteWidth = ((int)CpConv.size()) / (ResultLength / CharByteWidth); + unsigned EndianOffset = llvm::sys::IsBigEndianHost ? CharByteWidth - 1 : 0; + for (int i = 0; i < (int)CpConv.size(); i += NewCharByteWidth) + memcpy(Cp + EndianOffset + i * CharByteWidth, CpConv.data() + i, + NewCharByteWidth); + return Cp + CpConv.size() * CharByteWidth; +} + +void StringLiteralParser::init(ArrayRef StringToks, + ConversionAction Action) { // The literal token may have come from an invalid source location (e.g. due // to a PCH error), in which case the token length will be 0. if (StringToks.empty() || StringToks[0].getLength() < 2) @@ -2090,6 +2185,10 @@ void StringLiteralParser::init(ArrayRef StringToks){ SourceLocation UDSuffixTokLoc; + llvm::TextEncodingConverter *Converter = nullptr; + if (!isUTFLiteral(Kind) && !isWideLiteral(Kind) && LiteralConv) + Converter = LiteralConv->getConverter(Action); + for (unsigned i = 0, e = StringToks.size(); i != e; ++i) { const char *ThisTokBuf = &TokenBuf[0]; // Get the spelling of the token, which eliminates trigraphs, etc. We know @@ -2199,10 +2298,20 @@ void StringLiteralParser::init(ArrayRef StringToks){ StringRef BeforeCRLF = RemainingTokenSpan.substr(0, CRLFPos); StringRef AfterCRLF = RemainingTokenSpan.substr(CRLFPos); + char *ResultPtrBefore = ResultPtr; // Copy everything before the \r\n sequence into the string literal. 
if (CopyStringFragment(StringToks[i], ThisTokBegin, BeforeCRLF)) hadError = true; + if (!hadError && Converter) { + assert(Kind != tok::wide_string_literal && + "Wide character translation not supported"); + ResultPtr = convertCharactersInPlace( + ResultPtr, ResultPtrBefore, CharByteWidth, hadError, *Converter); + if (hadError && Diags) + Diags->Report(StringToks[i].getLocation(), + diag::err_exec_charset_conversion_failed) << Features.ExecEncoding; + } // Point into the \n inside the \r\n sequence and operate on the // remaining portion of the literal. RemainingTokenSpan = AfterCRLF.substr(1); @@ -2237,26 +2346,45 @@ void StringLiteralParser::init(ArrayRef StringToks){ ++ThisTokBuf; } while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\'); + char *ResultPtrBefore = ResultPtr; // Copy the character span over. if (CopyStringFragment(StringToks[i], ThisTokBegin, StringRef(InStart, ThisTokBuf - InStart))) hadError = true; + + if (!hadError && Converter) { + assert(Kind != tok::wide_string_literal && + "Wide character translation not supported"); + ResultPtr = + convertCharactersInPlace(ResultPtr, ResultPtrBefore, + CharByteWidth, hadError, *Converter); + if (hadError && Diags) + Diags->Report(StringToks[i].getLocation(), + diag::err_exec_charset_conversion_failed) << Features.ExecEncoding; + } continue; } // Is this a Universal Character Name escape? if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U' || ThisTokBuf[1] == 'N') { - EncodeUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, - ResultPtr, hadError, + char *Cp = ResultPtr; + EncodeUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, ResultPtr, + hadError, FullSourceLoc(StringToks[i].getLocation(), SM), CharByteWidth, Diags, Features); + if (!hadError && Converter) { + SmallString<8> CpConv; + Converter->convert(StringRef(Cp, ResultPtr - Cp), CpConv); // Explicit length: the encoded bytes may contain NUL. + memcpy(Cp, CpConv.data(), CpConv.size()); + ResultPtr = Cp + CpConv.size(); + } continue; } // Otherwise, this is a non-UCN escape character. Process it. 
- unsigned ResultChar = - ProcessCharEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, hadError, - FullSourceLoc(StringToks[i].getLocation(), SM), - CharByteWidth * 8, Diags, Features, EvalMethod); + unsigned ResultChar = ProcessCharEscape( + ThisTokBegin, ThisTokBuf, ThisTokEnd, hadError, + FullSourceLoc(StringToks[i].getLocation(), SM), CharByteWidth * 8, + Diags, Features, EvalMethod, Converter); if (CharByteWidth == 4) { // FIXME: Make the type of the result buffer correct instead of @@ -2454,7 +2582,8 @@ unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok, } else { ProcessCharEscape(SpellingStart, SpellingPtr, SpellingEnd, HadError, FullSourceLoc(Tok.getLocation(), SM), CharByteWidth * 8, - Diags, Features, StringLiteralEvalMethod::Evaluated); + Diags, Features, StringLiteralEvalMethod::Evaluated, + nullptr); --ByteNo; } assert(!HadError && "This method isn't valid on erroneous strings"); diff --git a/clang/lib/Lex/PPDirectives.cpp b/clang/lib/Lex/PPDirectives.cpp index 6a5e5d4bad3a6..0ce0f230ace3a 100644 --- a/clang/lib/Lex/PPDirectives.cpp +++ b/clang/lib/Lex/PPDirectives.cpp @@ -1557,7 +1557,8 @@ void Preprocessor::HandleLineDirective() { return; } else { // Parse and validate the string, converting it into a unique ID. - StringLiteralParser Literal(StrTok, *this); + StringLiteralParser Literal( + StrTok, *this, StringLiteralEvalMethod::Evaluated, CA_NoConversion); assert(Literal.isOrdinary() && "Didn't allow wide strings in"); if (Literal.hadError) { DiscardUntilEndOfDirective(); @@ -1708,7 +1709,8 @@ void Preprocessor::HandleDigitDirective(Token &DigitTok) { return; } else { // Parse and validate the string, converting it into a unique ID. 
- StringLiteralParser Literal(StrTok, *this); + StringLiteralParser Literal( + StrTok, *this, StringLiteralEvalMethod::Evaluated, CA_NoConversion); assert(Literal.isOrdinary() && "Didn't allow wide strings in"); if (Literal.hadError) { DiscardUntilEndOfDirective(); diff --git a/clang/test/CodeGen/systemz-charset.c b/clang/test/CodeGen/systemz-charset.c new file mode 100644 index 0000000000000..aab43157b1be4 --- /dev/null +++ b/clang/test/CodeGen/systemz-charset.c @@ -0,0 +1,35 @@ +// RUN: %clang_cc1 %s -emit-llvm -triple s390x-none-zos -fexec-charset IBM-1047 -o - | FileCheck %s +// RUN: %clang %s -emit-llvm -S -target s390x-ibm-zos -o - | FileCheck %s + +const char *UpperCaseLetters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; +// CHECK: c"\C1\C2\C3\C4\C5\C6\C7\C8\C9\D1\D2\D3\D4\D5\D6\D7\D8\D9\E2\E3\E4\E5\E6\E7\E8\E9\00" + +const char *LowerCaseLetters = "abcdefghijklmnopqrstuvwxyz"; +//CHECK: c"\81\82\83\84\85\86\87\88\89\91\92\93\94\95\96\97\98\99\A2\A3\A4\A5\A6\A7\A8\A9\00" + +const char *Digits = "0123456789"; +// CHECK: c"\F0\F1\F2\F3\F4\F5\F6\F7\F8\F9\00" + +const char *SpecialCharacters = " .<(+|&!$*);^-/,%%_>`:#@="; +// CHECK: c"@KLMNOPZ[\\]^_`akllmnyz{|~\00" + +const char *EscapeCharacters = "\a\b\f\n\r\t\v\\\'\"\?"; +//CHECK: c"/\16\0C\15\0D\05\0B\E0}\7Fo\00" + +const char *InvalidEscape = "\y\z"; +//CHECK: c"oo\00" + +const char *HexCharacters = "\x12\x13\x14"; +//CHECK: c"\12\13\14\00" + +const char *OctalCharacters = "\141\142\143"; +//CHECK: c"abc\00" + +const char singleChar = 'a'; +//CHECK: i8 -127 + +const char *UcnCharacters = "\u00E2\u00AC\U000000DF"; +//CHECK: c"B\B0Y\00" + +const char *Unicode = "ΓΏ"; +//CHECK: c"\DF\00" diff --git a/clang/test/CodeGen/systemz-charset.cpp b/clang/test/CodeGen/systemz-charset.cpp new file mode 100644 index 0000000000000..7e66407fd2ff1 --- /dev/null +++ b/clang/test/CodeGen/systemz-charset.cpp @@ -0,0 +1,46 @@ +// RUN: %clang %s -std=c++17 -emit-llvm -S -target s390x-ibm-zos -o - | FileCheck %s + +const char *RawString = 
R"(Hello\n)"; +//CHECK: c"\C8\85\93\93\96\E0\95\00" + +const char *MultiLineRawString = R"( +Hello +There)"; +//CHECK: c"\15\C8\85\93\93\96\15\E3\88\85\99\85\00" + +char UnicodeChar8 = u8'1'; +//CHECK: i8 49 +char16_t UnicodeChar16 = u'1'; +//CHECK: i16 49 +char32_t UnicodeChar32 = U'1'; +//CHECK: i32 49 + +const char *EscapeCharacters8 = u8"\a\b\f\n\r\t\v\\\'\"\?"; +//CHECK: c"\07\08\0C\0A\0D\09\0B\\'\22?\00" + +const char16_t *EscapeCharacters16 = u"\a\b\f\n\r\t\v\\\'\"\?"; +//CHECK: [12 x i16] [i16 7, i16 8, i16 12, i16 10, i16 13, i16 9, i16 11, i16 92, i16 39, i16 34, i16 63, i16 0] + +const char32_t *EscapeCharacters32 = U"\a\b\f\n\r\t\v\\\'\"\?"; +//CHECK: [12 x i32] [i32 7, i32 8, i32 12, i32 10, i32 13, i32 9, i32 11, i32 92, i32 39, i32 34, i32 63, i32 0] + +const char *UnicodeString8 = u8"Hello"; +//CHECK: c"Hello\00" +const char16_t *UnicodeString16 = u"Hello"; +//CHECK: [6 x i16] [i16 72, i16 101, i16 108, i16 108, i16 111, i16 0] +const char32_t *UnicodeString32 = U"Hello"; +//CHECK: [6 x i32] [i32 72, i32 101, i32 108, i32 108, i32 111, i32 0] + +const char *UnicodeRawString8 = u8R"("Hello\")"; +//CHECK: c"\22Hello\\\22\00" +const char16_t *UnicodeRawString16 = uR"("Hello\")"; +//CHECK: [9 x i16] [i16 34, i16 72, i16 101, i16 108, i16 108, i16 111, i16 92, i16 34, i16 0] +const char32_t *UnicodeRawString32 = UR"("Hello\")"; +//CHECK: [9 x i32] [i32 34, i32 72, i32 101, i32 108, i32 108, i32 111, i32 92, i32 34, i32 0] + +const char *UnicodeUCNString8 = u8"\u00E2\u00AC\U000000DF"; +//CHECK: c"\C3\A2\C2\AC\C3\9F\00" +const char16_t *UnicodeUCNString16 = u"\u00E2\u00AC\U000000DF"; +//CHECK: [4 x i16] [i16 226, i16 172, i16 223, i16 0] +const char32_t *UnicodeUCNString32 = U"\u00E2\u00AC\U000000DF"; +//CHECK: [4 x i32] [i32 226, i32 172, i32 223, i32 0] diff --git a/clang/test/Driver/cl-options.c b/clang/test/Driver/cl-options.c index 1b1169b71554a..29202d7b1a266 100644 --- a/clang/test/Driver/cl-options.c +++ b/clang/test/Driver/cl-options.c @@ -250,10 
+250,11 @@ // RUN: not %clang_cl /source-charset:utf-16 -### -- %s 2>&1 | FileCheck -check-prefix=source-charset-utf-16 %s // source-charset-utf-16: invalid value 'utf-16' in '/source-charset:utf-16' -// /execution-charset: should warn on everything except UTF-8. -// RUN: not %clang_cl /execution-charset:utf-16 -### -- %s 2>&1 | FileCheck -check-prefix=execution-charset-utf-16 %s -// execution-charset-utf-16: invalid value 'utf-16' in '/execution-charset:utf-16' +// /execution-charset: should error on invalid charsets. +// RUN: not %clang_cl /execution-charset:invalid-charset -### -- %s 2>&1 | FileCheck -check-prefix=execution-charset-invalid %s +// execution-charset-invalid: invalid value 'invalid-charset' in '/execution-charset:invalid-charset' // + // RUN: %clang_cl /Umymacro -### -- %s 2>&1 | FileCheck -check-prefix=U %s // RUN: %clang_cl /U mymacro -### -- %s 2>&1 | FileCheck -check-prefix=U %s // U: "-U" "mymacro" diff --git a/clang/test/Driver/clang_f_opts.c b/clang/test/Driver/clang_f_opts.c index 765f9d6ae3212..6792ad96398a4 100644 --- a/clang/test/Driver/clang_f_opts.c +++ b/clang/test/Driver/clang_f_opts.c @@ -232,8 +232,16 @@ // RUN: not %clang -### -S -finput-charset=iso-8859-1 -o /dev/null %s 2>&1 | FileCheck -check-prefix=CHECK-INVALID-INPUT-CHARSET %s // CHECK-INVALID-INPUT-CHARSET: error: invalid value 'iso-8859-1' in '-finput-charset=iso-8859-1' -// RUN: not %clang -### -S -fexec-charset=iso-8859-1 -o /dev/null %s 2>&1 | FileCheck -check-prefix=CHECK-INVALID-EXEC-CHARSET %s -// CHECK-INVALID-EXEC-CHARSET: error: invalid value 'iso-8859-1' in '-fexec-charset=iso-8859-1' +// RUN: not %clang -### -S -fexec-charset=invalid-charset -o /dev/null %s 2>&1 | FileCheck -check-prefix=CHECK-INVALID-EXEC-CHARSET %s +// CHECK-INVALID-EXEC-CHARSET: error: invalid value 'invalid-charset' in '-fexec-charset=invalid-charset' + +// Test that we support the following exec charsets.
The preferred MIME name is +// `IBM1047`, but `IBM-1047` is the name used by z/OS USS utilities such as +// `chtag`. +// RUN: %clang -### -S -fexec-charset=UTF-8 -o /dev/null %s 2>&1 | FileCheck --check-prefix=CHECK-EXEC-CHARSET-UTF-8 %s +// RUN: %clang -### -S -fexec-charset=IBM-1047 -o /dev/null %s 2>&1 | FileCheck --check-prefix=CHECK-EXEC-CHARSET-IBM-1047 %s +// CHECK-EXEC-CHARSET-UTF-8: "-fexec-charset" "UTF-8" +// CHECK-EXEC-CHARSET-IBM-1047: "-fexec-charset" "IBM-1047" // Test that we don't error on these. // RUN: not %clang -### -S -Werror \ @@ -247,7 +255,7 @@ // RUN: -fident -fno-ident \ // RUN: -fimplicit-templates -fno-implicit-templates \ // RUN: -finput-charset=UTF-8 \ -// RUN: -fexec-charset=UTF-8 \ +// RUN: -fexec-charset=UTF-8 \ // RUN: -fivopts -fno-ivopts \ // RUN: -fnon-call-exceptions -fno-non-call-exceptions \ // RUN: -fpermissive -fno-permissive \ diff --git a/clang/test/Preprocessor/init-s390x.c b/clang/test/Preprocessor/init-s390x.c index a8fbde46cbb75..9ff122def913f 100644 --- a/clang/test/Preprocessor/init-s390x.c +++ b/clang/test/Preprocessor/init-s390x.c @@ -206,4 +206,5 @@ // S390X-ZOS: #define __TOS_390__ 1 // S390X-ZOS: #define __TOS_MVS__ 1 // S390X-ZOS: #define __XPLINK__ 1 +// S390X-ZOS: #define __clang_literal_encoding__ IBM-1047 // S390X-ZOS-GNUXX: #define __wchar_t 1 diff --git a/llvm/include/llvm/Support/TextEncoding.h b/llvm/include/llvm/Support/TextEncoding.h index 8a304910aa5dd..bda6f2a088eb2 100644 --- a/llvm/include/llvm/Support/TextEncoding.h +++ b/llvm/include/llvm/Support/TextEncoding.h @@ -135,6 +135,9 @@ class TextEncodingConverter { return std::string(Result); return EC; } + + // Maps the encoding name to enum constant if possible. 
+ static std::optional getKnownEncoding(StringRef Name); }; } // namespace llvm diff --git a/llvm/include/llvm/TargetParser/Triple.h b/llvm/include/llvm/TargetParser/Triple.h index 0e82dd212f34d..1e40841ffbb1c 100644 --- a/llvm/include/llvm/TargetParser/Triple.h +++ b/llvm/include/llvm/TargetParser/Triple.h @@ -507,6 +507,9 @@ class Triple { /// For example, "fooos1.2.3" would return "1.2.3". LLVM_ABI StringRef getEnvironmentVersionString() const; + /// getDefaultNarrowTextEncoding - Get the name of the default narrow text encoding for this triple: "IBM-1047" on z/OS and "UTF-8" otherwise. + StringRef getDefaultNarrowTextEncoding() const; + /// @} /// @name Convenience Predicates /// @{ diff --git a/llvm/lib/Support/TextEncoding.cpp b/llvm/lib/Support/TextEncoding.cpp index 41f51877d7128..b7d73ff5b8412 100644 --- a/llvm/lib/Support/TextEncoding.cpp +++ b/llvm/lib/Support/TextEncoding.cpp @@ -44,7 +44,8 @@ static void normalizeCharSetName(StringRef CSName, } // Maps the encoding name to enum constant if possible. -static std::optional getKnownEncoding(StringRef Name) { +std::optional +TextEncodingConverter::getKnownEncoding(StringRef Name) { SmallString<16> Normalized; normalizeCharSetName(Name, Normalized); if (Normalized.equals("utf8")) diff --git a/llvm/lib/TargetParser/Triple.cpp b/llvm/lib/TargetParser/Triple.cpp index 11ba9ee32f66a..21a08a31265a5 100644 --- a/llvm/lib/TargetParser/Triple.cpp +++ b/llvm/lib/TargetParser/Triple.cpp @@ -1419,6 +1419,13 @@ StringRef Triple::getOSAndEnvironmentName() const { return Tmp.split('-').second; // Strip second component } +// Default encoding on z/OS is IBM-1047 and UTF-8 otherwise +StringRef Triple::getDefaultNarrowTextEncoding() const { + if (getOS() == llvm::Triple::ZOS) + return "IBM-1047"; + return "UTF-8"; +} + static VersionTuple parseVersionFromName(StringRef Name) { VersionTuple Version; Version.tryParse(Name);