Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions clang/docs/LanguageExtensions.rst
Original file line number Diff line number Diff line change
Expand Up @@ -419,9 +419,9 @@ Builtin Macros

``__clang_literal_encoding__``
Defined to a narrow string literal that represents the current encoding of
narrow string literals, e.g., ``"hello"``. This macro typically expands to
"UTF-8" (but may change in the future if the
``-fexec-charset="Encoding-Name"`` option is implemented.)
narrow string literals, e.g., ``"hello"``. This macro expands to the text
encoding specified by ``-fexec-charset`` if any, or a system-specific default
otherwise: ``"IBM-1047"`` on z/OS and ``"UTF-8"`` on all other systems.

``__clang_wide_literal_encoding__``
Defined to a narrow string literal that represents the current encoding of
Expand Down
3 changes: 2 additions & 1 deletion clang/include/clang/Basic/DiagnosticFrontendKinds.td
Original file line number Diff line number Diff line change
Expand Up @@ -342,7 +342,8 @@ def err_non_default_visibility_dllimport : Error<
"non-default visibility cannot be applied to 'dllimport' declaration">;
def err_ifunc_resolver_return : Error<
"ifunc resolver function must return a pointer">;

def err_fe_literal_conv_config : Error<
"failed to configure the literal converter">;
def warn_atomic_op_misaligned : Warning<
"misaligned atomic operation may incur "
"significant performance penalty"
Expand Down
2 changes: 2 additions & 0 deletions clang/include/clang/Basic/DiagnosticLexKinds.td
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,8 @@ def ext_string_too_long : Extension<"string literal of length %0 exceeds "
"support">, InGroup<OverlengthStrings>;
def err_character_too_large : Error<
"character too large for enclosing character literal type">;
def err_exec_charset_conversion_failed : Error<
"conversion to execution encoding failed: '%0'">;
def warn_c99_compat_unicode_literal : Warning<
"unicode literals are incompatible with C99">,
InGroup<C99Compat>, DefaultIgnore;
Expand Down
3 changes: 3 additions & 0 deletions clang/include/clang/Basic/LangOptions.h
Original file line number Diff line number Diff line change
Expand Up @@ -573,6 +573,9 @@ class LangOptions : public LangOptionsBase {
/// The allocation token mode.
std::optional<llvm::AllocTokenMode> AllocTokenMode;

/// Name of the execution encoding to convert the internal encoding to.
std::string ExecEncoding;

LangOptions();

/// Set language defaults for the given input language and
Expand Down
12 changes: 12 additions & 0 deletions clang/include/clang/Basic/TokenKinds.h
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,18 @@ inline bool isLiteral(TokenKind K) {
return isInLiteralRange;
}

/// Report whether \p K is one of the UTF-8, UTF-16 or UTF-32 token kinds,
/// covering both the character-constant and the string-literal form of each.
inline bool isUTFLiteral(TokenKind K) {
  switch (K) {
  case tok::utf8_char_constant:
  case tok::utf8_string_literal:
  case tok::utf16_char_constant:
  case tok::utf16_string_literal:
  case tok::utf32_char_constant:
  case tok::utf32_string_literal:
    return true;
  default:
    return false;
  }
}

/// Report whether \p K is the wide character-constant or the wide
/// string-literal token kind.
inline bool isWideLiteral(TokenKind K) {
  const bool IsWideChar = (K == tok::wide_char_constant);
  const bool IsWideString = (K == tok::wide_string_literal);
  return IsWideChar || IsWideString;
}

/// Return true if this is any of tok::annot_* kinds.
bool isAnnotation(TokenKind K);

Expand Down
14 changes: 12 additions & 2 deletions clang/include/clang/Driver/Options.td
Original file line number Diff line number Diff line change
Expand Up @@ -2967,7 +2967,10 @@ def fexperimental_strict_floating_point : Flag<["-"], "fexperimental-strict-floa
def finput_charset_EQ : Joined<["-"], "finput-charset=">,
Visibility<[ClangOption, FlangOption, FC1Option]>, Group<f_Group>,
HelpText<"Specify the default character set for source files">;
def fexec_charset_EQ : Joined<["-"], "fexec-charset=">, Group<f_Group>;
def fexec_charset_EQ : Joined<["-"], "fexec-charset=">, Group<f_Group>,
HelpText<"Set the execution <charset> for string and character literals. "
"Supported character encodings include ISO-8859-1, UTF-8, IBM1047, "
"and possibly those supported by ICU or the host iconv library.">;
def finstrument_functions
: Flag<["-"], "finstrument-functions">,
Group<f_Group>,
Expand Down Expand Up @@ -7473,6 +7476,11 @@ let Visibility = [CC1Option, CC1AsOption, FC1Option] in {
def tune_cpu : Separate<["-"], "tune-cpu">,
HelpText<"Tune for a specific cpu type">,
MarshallingInfoString<TargetOpts<"TuneCPU">>;
def fexec_charset : Separate<["-"], "fexec-charset">, MetaVarName<"<charset>">,
HelpText<"Set the execution <charset> for string and character literals. "
"Supported character encodings include ISO-8859-1, UTF-8, IBM1047, "
"and possibly those supported by ICU or the host iconv library.">,
MarshallingInfoString<LangOpts<"ExecEncoding">>;
def target_cpu : Separate<["-"], "target-cpu">,
HelpText<"Target a specific cpu type">,
MarshallingInfoString<TargetOpts<"CPU">>;
Expand Down Expand Up @@ -9078,7 +9086,9 @@ def _SLASH_source_charset : CLCompileJoined<"source-charset:">,
HelpText<"Set source encoding, supports only UTF-8">,
Alias<finput_charset_EQ>;
def _SLASH_execution_charset : CLCompileJoined<"execution-charset:">,
HelpText<"Set runtime encoding, supports only UTF-8">,
HelpText<"Set the execution <charset> for string and character literals. "
"Supported character encodings include ISO-8859-1, UTF-8, IBM1047, "
"and possibly those supported by ICU or the host iconv library.">,
Alias<fexec_charset_EQ>;
def _SLASH_std : CLCompileJoined<"std:">,
HelpText<"Set language version (c++14,c++17,c++20,c++23preview,c++latest,c11,c17)">;
Expand Down
40 changes: 40 additions & 0 deletions clang/include/clang/Lex/LiteralConverter.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
//===--- clang/Lex/LiteralConverter.h - Translator for Literals -*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_CLANG_LEX_LITERALCONVERTER_H
#define LLVM_CLANG_LEX_LITERALCONVERTER_H

#include "clang/Basic/Diagnostic.h"
#include "clang/Basic/LangOptions.h"
#include "clang/Basic/TargetInfo.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/TextEncoding.h"

/// Selects which converter LiteralConverter::getConverter() hands back.
enum ConversionAction {
/// No converter is selected; getConverter() returns nullptr.
CA_NoConversion,
/// Selects the converter stored for the system encoding.
CA_ToSystemEncoding,
/// Selects the converter stored for the execution encoding.
CA_ToExecEncoding
};

class LiteralConverter {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This class seems misnamed. It's not a converter, it represents the conversion configuration.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What would be a better name? TextEncodingConfig ? LiteralConfig?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, TextEncodingConfig sounds like it would work.

llvm::StringRef InternalEncoding;
llvm::StringRef SystemEncoding;
llvm::StringRef ExecEncoding;
llvm::TextEncodingConverter *ToSystemEncodingConverter = nullptr;
llvm::TextEncodingConverter *ToExecEncodingConverter = nullptr;

public:
llvm::TextEncodingConverter *getConverter(ConversionAction Action);
static std::error_code
setConvertersFromOptions(LiteralConverter &LiteralConv,
const clang::LangOptions &Opts,
const clang::TargetInfo &TInfo);
};

#endif
19 changes: 11 additions & 8 deletions clang/include/clang/Lex/LiteralSupport.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,13 @@
#include "clang/Basic/CharInfo.h"
#include "clang/Basic/LLVM.h"
#include "clang/Basic/TokenKinds.h"
#include "clang/Lex/LiteralConverter.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/DataTypes.h"

#include "llvm/Support/TextEncoding.h"
namespace clang {

class DiagnosticsEngine;
Expand Down Expand Up @@ -233,6 +234,7 @@ class StringLiteralParser {
const LangOptions &Features;
const TargetInfo &Target;
DiagnosticsEngine *Diags;
LiteralConverter *LiteralConv;

unsigned MaxTokenLength;
unsigned SizeBound;
Expand All @@ -246,18 +248,19 @@ class StringLiteralParser {
StringLiteralEvalMethod EvalMethod;

public:
StringLiteralParser(ArrayRef<Token> StringToks, Preprocessor &PP,
StringLiteralEvalMethod StringMethod =
StringLiteralEvalMethod::Evaluated);
StringLiteralParser(
ArrayRef<Token> StringToks, Preprocessor &PP,
StringLiteralEvalMethod StringMethod = StringLiteralEvalMethod::Evaluated,
ConversionAction Action = CA_ToExecEncoding);
StringLiteralParser(ArrayRef<Token> StringToks, const SourceManager &sm,
const LangOptions &features, const TargetInfo &target,
DiagnosticsEngine *diags = nullptr)
: SM(sm), Features(features), Target(target), Diags(diags),
MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown),
ResultPtr(ResultBuf.data()),
LiteralConv(nullptr), MaxTokenLength(0), SizeBound(0), CharByteWidth(0),
Kind(tok::unknown), ResultPtr(ResultBuf.data()),
EvalMethod(StringLiteralEvalMethod::Evaluated), hadError(false),
Pascal(false) {
init(StringToks);
init(StringToks, CA_NoConversion);
}

bool hadError;
Expand Down Expand Up @@ -305,7 +308,7 @@ class StringLiteralParser {
static bool isValidUDSuffix(const LangOptions &LangOpts, StringRef Suffix);

private:
void init(ArrayRef<Token> StringToks);
void init(ArrayRef<Token> StringToks, ConversionAction Action);
bool CopyStringFragment(const Token &Tok, const char *TokBegin,
StringRef Fragment);
void DiagnoseLexingError(SourceLocation Loc);
Expand Down
3 changes: 3 additions & 0 deletions clang/include/clang/Lex/Preprocessor.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#include "clang/Basic/TokenKinds.h"
#include "clang/Lex/HeaderSearch.h"
#include "clang/Lex/Lexer.h"
#include "clang/Lex/LiteralConverter.h"
#include "clang/Lex/MacroInfo.h"
#include "clang/Lex/ModuleLoader.h"
#include "clang/Lex/ModuleMap.h"
Expand Down Expand Up @@ -163,6 +164,7 @@ class Preprocessor {
std::unique_ptr<ScratchBuffer> ScratchBuf;
HeaderSearch &HeaderInfo;
ModuleLoader &TheModuleLoader;
LiteralConverter LiteralConv;

/// External source of macros.
ExternalPreprocessorSource *ExternalSource;
Expand Down Expand Up @@ -1235,6 +1237,7 @@ class Preprocessor {
SelectorTable &getSelectorTable() { return Selectors; }
Builtin::Context &getBuiltinInfo() { return *BuiltinInfo; }
llvm::BumpPtrAllocator &getPreprocessorAllocator() { return BP; }
LiteralConverter &getLiteralConverter() { return LiteralConv; }

void setExternalSource(ExternalPreprocessorSource *Source) {
ExternalSource = Source;
Expand Down
25 changes: 19 additions & 6 deletions clang/lib/Driver/ToolChains/Clang.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/Process.h"
#include "llvm/Support/TextEncoding.h"
#include "llvm/Support/YAMLParser.h"
#include "llvm/TargetParser/AArch64TargetParser.h"
#include "llvm/TargetParser/ARMTargetParserCommon.h"
Expand Down Expand Up @@ -7363,12 +7364,24 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
<< value;
}

// -fexec_charset=UTF-8 is default. Reject others
if (Arg *execCharset = Args.getLastArg(options::OPT_fexec_charset_EQ)) {
StringRef value = execCharset->getValue();
if (!value.equals_insensitive("utf-8"))
D.Diag(diag::err_drv_invalid_value) << execCharset->getAsString(Args)
<< value;
if (Arg *execEncoding = Args.getLastArg(options::OPT_fexec_charset_EQ)) {
StringRef value = execEncoding->getValue();
bool KnownEncoding =
llvm::TextEncodingConverter::getKnownEncoding(value).has_value();
if (!KnownEncoding) {
llvm::ErrorOr<llvm::TextEncodingConverter> ErrorOrConverter =
llvm::TextEncodingConverter::create("UTF-8", value.data());
if (!ErrorOrConverter)
D.Diag(diag::err_drv_invalid_value)
<< execEncoding->getAsString(Args) << value;
}
CmdArgs.push_back("-fexec-charset");
CmdArgs.push_back(Args.MakeArgString(value));
} else {
// Set the default fexec-charset as the system charset.
CmdArgs.push_back("-fexec-charset");
CmdArgs.push_back(
Args.MakeArgString(Triple.getDefaultNarrowTextEncoding()));
}

RenderDiagnosticsOptions(D, Args, CmdArgs);
Expand Down
5 changes: 5 additions & 0 deletions clang/lib/Frontend/CompilerInstance.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
#include "clang/Frontend/Utils.h"
#include "clang/Frontend/VerifyDiagnosticConsumer.h"
#include "clang/Lex/HeaderSearch.h"
#include "clang/Lex/LiteralConverter.h"
#include "clang/Lex/Preprocessor.h"
#include "clang/Lex/PreprocessorOptions.h"
#include "clang/Sema/CodeCompleteConsumer.h"
Expand Down Expand Up @@ -541,6 +542,10 @@ void CompilerInstance::createPreprocessor(TranslationUnitKind TUKind) {

if (GetDependencyDirectives)
PP->setDependencyDirectivesGetter(*GetDependencyDirectives);

if (LiteralConverter::setConvertersFromOptions(PP->getLiteralConverter(),
getLangOpts(), getTarget()))
PP->getDiagnostics().Report(clang::diag::err_fe_literal_conv_config);
}

std::string CompilerInstance::getSpecificModuleCachePath(StringRef ModuleHash) {
Expand Down
4 changes: 3 additions & 1 deletion clang/lib/Frontend/FrontendAction.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -525,7 +525,9 @@ static SourceLocation ReadOriginalFileName(CompilerInstance &CI,
if (T.isAtStartOfLine() || T.getKind() != tok::string_literal)
return SourceLocation();

StringLiteralParser Literal(T, CI.getPreprocessor());
StringLiteralParser Literal(T, CI.getPreprocessor(),
StringLiteralEvalMethod::Evaluated,
CA_NoConversion);
if (Literal.hadError)
return SourceLocation();
RawLexer->LexFromRawLexer(T);
Expand Down
12 changes: 8 additions & 4 deletions clang/lib/Frontend/InitPreprocessor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1022,10 +1022,14 @@ static void InitializePredefinedMacros(const TargetInfo &TI,
}
}

// Macros to help identify the narrow and wide character sets
// FIXME: clang currently ignores -fexec-charset=. If this changes,
// then this may need to be updated.
Builder.defineMacro("__clang_literal_encoding__", "\"UTF-8\"");
// Macros to help identify the narrow and wide character sets. This is set
// to fexec-charset. If fexec-charset is not specified, the default is the
// system charset.
// __clang_literal_encoding__ is documented as expanding to a narrow string
// literal, so the encoding name has to be wrapped in double quotes. Passing
// the bare name would define the macro as e.g. `UTF-8`, which is not a
// string literal and would not match the previous `"UTF-8"` expansion.
if (!LangOpts.ExecEncoding.empty())
  Builder.defineMacro("__clang_literal_encoding__",
                      "\"" + llvm::Twine(LangOpts.ExecEncoding) + "\"");
else
  Builder.defineMacro(
      "__clang_literal_encoding__",
      "\"" + llvm::Twine(TI.getTriple().getDefaultNarrowTextEncoding()) +
          "\"");
if (TI.getTypeWidth(TI.getWCharType()) >= 32) {
// FIXME: 32-bit wchar_t signals UTF-32. This may change
// if -fwide-exec-charset= is ever supported.
Expand Down
1 change: 1 addition & 0 deletions clang/lib/Lex/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ add_clang_library(clangLex
InitHeaderSearch.cpp
Lexer.cpp
LexHLSLRootSignature.cpp
LiteralConverter.cpp
LiteralSupport.cpp
MacroArgs.cpp
MacroInfo.cpp
Expand Down
60 changes: 60 additions & 0 deletions clang/lib/Lex/LiteralConverter.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
//===--- LiteralConverter.cpp - Translator for String Literals -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "clang/Lex/LiteralConverter.h"
#include "clang/Basic/DiagnosticDriver.h"

using namespace llvm;

/// Look up the converter associated with \p Action.
///
/// \returns the stored system-encoding converter for CA_ToSystemEncoding, the
/// stored execution-encoding converter for CA_ToExecEncoding, and nullptr for
/// any other action. Either stored pointer may itself be null.
llvm::TextEncodingConverter *
LiteralConverter::getConverter(ConversionAction Action) {
  switch (Action) {
  case CA_ToSystemEncoding:
    return ToSystemEncodingConverter;
  case CA_ToExecEncoding:
    return ToExecEncodingConverter;
  default:
    return nullptr;
  }
}

/// Configure \p LiteralConv from the language options and the target.
///
/// The internal encoding is fixed to "UTF-8". The system encoding is the
/// target triple's default narrow text encoding. The execution encoding is
/// \p Opts.ExecEncoding, or the internal encoding when that option is empty.
/// A converter is created only for a destination encoding whose name differs
/// from the internal encoding; names are compared case-insensitively so that
/// spellings such as "utf-8" do not request a pointless UTF-8 to UTF-8
/// converter.
///
/// \param LiteralConv the object whose encodings and converters are set.
/// \param Opts        supplies the requested execution encoding name.
/// \param TInfo       supplies the target triple.
/// \returns a default-constructed std::error_code on success, otherwise the
///          error reported by llvm::TextEncodingConverter::create. If creating
///          the system converter fails, the execution converter is not
///          attempted.
std::error_code
LiteralConverter::setConvertersFromOptions(LiteralConverter &LiteralConv,
                                           const clang::LangOptions &Opts,
                                           const clang::TargetInfo &TInfo) {
  LiteralConv.InternalEncoding = "UTF-8";
  LiteralConv.SystemEncoding = TInfo.getTriple().getDefaultNarrowTextEncoding();
  // ExecEncoding is a StringRef that views Opts.ExecEncoding when it is set.
  LiteralConv.ExecEncoding = Opts.ExecEncoding.empty()
                                 ? LiteralConv.InternalEncoding
                                 : llvm::StringRef(Opts.ExecEncoding);

  // Creates a converter from the internal encoding to `To` and stores it in
  // `Slot`. Leaves `Slot` untouched when no conversion is needed or when
  // creation fails.
  auto CreateInto =
      [&LiteralConv](llvm::StringRef To,
                     llvm::TextEncodingConverter *&Slot) -> std::error_code {
    // Encoding names are case-insensitive; an identical destination needs no
    // converter at all.
    if (LiteralConv.InternalEncoding.equals_insensitive(To))
      return std::error_code();

    llvm::ErrorOr<llvm::TextEncodingConverter> Created =
        llvm::TextEncodingConverter::create(LiteralConv.InternalEncoding, To);
    if (!Created)
      return Created.getError();

    // NOTE(review): the converter is heap-allocated and held through a raw
    // pointer; no matching release is visible here -- confirm who owns it.
    Slot = new llvm::TextEncodingConverter(std::move(*Created));
    return std::error_code();
  };

  // Converter between the internal and the system encoding.
  if (std::error_code EC = CreateInto(LiteralConv.SystemEncoding,
                                      LiteralConv.ToSystemEncodingConverter))
    return EC;

  // Converter between the internal encoding and the execution encoding
  // specified through the fexec-charset option.
  return CreateInto(LiteralConv.ExecEncoding,
                    LiteralConv.ToExecEncodingConverter);
}
Loading