Skip to content

Commit db025cf

Browse files
committed
Re-implement support for UTF-8 encoded source files with signature BOM.
1 parent fa25626 commit db025cf

File tree

8 files changed

+111
-12
lines changed

8 files changed

+111
-12
lines changed

source/parser/parser.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -420,6 +420,7 @@ class Parser : public SceneTask
420420
void pre_init_tokenizer (void);
421421
void Initialize_Tokenizer (void);
422422
void Terminate_Tokenizer (void);
423+
void CheckFileSignature();
423424
SYM_ENTRY *Add_Symbol (SYM_TABLE *table, const char *Name,TokenId Number);
424425
SYM_ENTRY *Add_Symbol (int Index,const char *Name,TokenId Number);
425426
POV_ARRAY *Parse_Array_Declare (void);

source/parser/parser_tokenizer.cpp

Lines changed: 29 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -95,9 +95,9 @@ void Parser::Initialize_Tokenizer()
9595
shared_ptr<IStream> rfile;
9696
UCS2String actualFileName;
9797

98-
pre_init_tokenizer ();
98+
pre_init_tokenizer();
9999

100-
rfile = Locate_File(sceneData->inputFile.c_str(),POV_File_Text_POV,actualFileName,true);
100+
rfile = Locate_File(sceneData->inputFile.c_str(), POV_File_Text_POV, actualFileName, true);
101101
if (rfile == nullptr)
102102
Error("Cannot open input file.");
103103

@@ -106,7 +106,7 @@ void Parser::Initialize_Tokenizer()
106106

107107
mHavePendingRawToken = false;
108108

109-
Got_EOF = false;
109+
Got_EOF = false;
110110

111111
/* Init conditional stack. */
112112

@@ -118,17 +118,33 @@ void Parser::Initialize_Tokenizer()
118118
Max_Trace_Level = MAX_TRACE_LEVEL_DEFAULT;
119119
Had_Max_Trace_Level = false;
120120

121-
/// @todo Re-enable UTF-8 signature BOM handling.
122-
#if 0
123-
/* ignore any leading characters if they have character codes above 127, this
124-
takes care of UTF-8 files with encoding info at the beginning of the file */
125-
for(c = Echo_getc(); c > 127; c = Echo_getc())
126-
sceneData->stringEncoding = kStringEncoding_UTF8; // switch to UTF-8 automatically [trf]
127-
Echo_ungetc(c);
128-
#endif
121+
CheckFileSignature();
129122
}
130123

131124

125+
//******************************************************************************
126+
127+
128+
void Parser::CheckFileSignature()
129+
{
130+
RawToken signature;
131+
if (GetRawToken(signature, false))
132+
{
133+
if (signature.expressionId == SIGNATURE_FUNCT_TOKEN)
134+
{
135+
// Found a signature. Switch to the corresponding encoding automatically.
136+
///@todo Still need to work on the mechanism to handle string encoding.
137+
switch (signature.id)
138+
{
139+
case UTF8_SIGNATURE_TOKEN: /* sceneData->stringEncoding = kStringEncoding_UTF8; */ break;
140+
default: POV_PARSER_ASSERT(false); break;
141+
}
142+
}
143+
else
144+
UngetRawToken(signature);
145+
}
146+
}
147+
132148

133149
/*****************************************************************************
134150
*
@@ -3512,6 +3528,8 @@ void Parser::IncludeHeader(const UCS2String& formalFileName)
35123528
mToken.is_array_elem = false;
35133529
mToken.is_mixed_array_elem = false;
35143530
mToken.is_dictionary_elem = false;
3531+
3532+
CheckFileSignature();
35153533
}
35163534

35173535
}

source/parser/rawtokenizer.cpp

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,7 @@ bool RawTokenizer::GetNextToken(RawToken& token)
126126
case Lexeme::kFloatLiteral: if (ProcessFloatLiteralLexeme(token)) return true;
127127
case Lexeme::kStringLiteral: if (ProcessStringLiteralLexeme(token)) return true;
128128
case Lexeme::kOther: if (ProcessOtherLexeme(token)) return true;
129+
case Lexeme::kUTF8SignatureBOM: if (ProcessSignatureLexeme(token)) return true;
129130
default: POV_PARSER_ASSERT(false); return true;
130131
}
131132
}
@@ -389,9 +390,30 @@ bool RawTokenizer::ProcessOtherLexeme(RawToken& token)
389390
return true;
390391
}
391392

393+
/// Turn a file signature lexeme into a raw token.
///
/// @param[in,out]  token   Raw token whose `lexeme` has already been filled in;
///                         on success its `id`, `expressionId` and `value` are set.
/// @return                 `true` if the lexeme category is a known signature,
///                         `false` otherwise (after tripping an assertion).
bool RawTokenizer::ProcessSignatureLexeme(RawToken& token)
{
    POV_PARSER_ASSERT(token.lexeme.text.size() != 0);

    // The UTF-8 BOM is the only signature category known so far.
    if (token.lexeme.category != Lexeme::kUTF8SignatureBOM)
    {
        POV_PARSER_ASSERT(false);
        return false;
    }

    const TokenId signatureId = UTF8_SIGNATURE_TOKEN;

    token.id           = static_cast<int>(signatureId);
    token.expressionId = GetExpressionId(signatureId);
    token.value        = nullptr; // signature tokens carry no value

    return true;
}
411+
392412
TokenId pov_parser::RawTokenizer::GetExpressionId(TokenId tokenId)
393413
{
394-
if (tokenId <= FLOAT_FUNCT_TOKEN)
414+
if (tokenId <= SIGNATURE_FUNCT_TOKEN)
415+
return SIGNATURE_FUNCT_TOKEN;
416+
else if (tokenId <= FLOAT_FUNCT_TOKEN)
395417
return FLOAT_FUNCT_TOKEN;
396418
else if (tokenId <= VECTOR_FUNCT_TOKEN)
397419
return VECTOR_FUNCT_TOKEN;

source/parser/rawtokenizer.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -241,6 +241,7 @@ class RawTokenizer
241241
bool ProcessOtherLexeme(RawToken& token);
242242
bool ProcessFloatLiteralLexeme(RawToken& token);
243243
bool ProcessStringLiteralLexeme(RawToken& token);
244+
bool ProcessSignatureLexeme(RawToken& token);
244245

245246
bool ProcessUCSEscapeDigits(UCS4& c, UTF8String::const_iterator& i, UTF8String::const_iterator& escapeSequenceEnd, unsigned int digits);
246247

source/parser/reservedwords.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -672,6 +672,7 @@ const RESERVED_WORD Reserved_Words[] = {
672672

673673
{ COLOUR_KEY_TOKEN, "color keyword" },
674674
{ FLOAT_FUNCT_TOKEN, "float function" },
675+
{ SIGNATURE_FUNCT_TOKEN, "signature function" },
675676
{ VECTOR_FUNCT_TOKEN, "vector function" },
676677

677678
//------------------------------------------------------------------------------
@@ -726,6 +727,7 @@ const RESERVED_WORD Reserved_Words[] = {
726727
{ END_OF_FILE_TOKEN, "End of File" },
727728
{ FLOAT_TOKEN, "float constant" },
728729
{ STRING_LITERAL_TOKEN, "string literal" },
730+
{ UTF8_SIGNATURE_TOKEN, "UTF-8 signature BOM" },
729731

730732
//------------------------------------------------------------------------------
731733
// End of list, marked by TokenId TOKEN_COUNT_ and nullptr token string.

source/parser/reservedwords.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,20 @@ struct Reserved_Word_Struct
5656
// Token Definitions for Parser
5757
enum TokenId : int
5858
{
59+
//------------------------------------------------------------------------------
60+
// Signature Tokens.
61+
//
62+
// All tokens that indicate a file format must go here.
63+
//
64+
// Please keep this section neatly sorted by the token identifier name,
65+
// sorting underscore characters before digits, digits before letters,
66+
// and short names before long ones, but _ignoring_ the trailing `_TOKEN` or
67+
// `_ID_TOKEN`.
68+
69+
UTF8_SIGNATURE_TOKEN,
70+
71+
SIGNATURE_FUNCT_TOKEN, // must be last in this section
72+
5973
//------------------------------------------------------------------------------
6074
// Float Tokens.
6175
//

source/parser/scanner.cpp

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
// C++ variants of C standard header files
4040
#include <cstdio>
4141
#include <cstdlib>
42+
#include <cstring>
4243

4344
// C++ standard header files
4445
#include <limits>
@@ -396,6 +397,16 @@ bool Scanner::GetNextLexeme(Lexeme& lexeme)
396397
if (mpSource == nullptr)
397398
return false;
398399

400+
if ((mpNextChar == maBuffer) && (mBase == 0))
401+
{
402+
// At the very start of the stream.
403+
// Check for file format signatures.
404+
405+
// Currently, only UTF-8 is supported.
406+
if (GetNextSignatureLexeme(lexeme, Lexeme::kUTF8SignatureBOM, u8"\uFEFF"))
407+
return true;
408+
}
409+
399410
while (!mEndOfStream)
400411
{
401412
// Skip over any whitespace (including blank lines).
@@ -758,6 +769,32 @@ bool Scanner::EatNextBlockComment()
758769

759770
//------------------------------------------------------------------------------
760771

772+
/// Test for a specific file format signature at the very start of the stream.
///
/// Must only be called while the scanner is still positioned at the first
/// octet of the buffer (`mBase == 0` and `mpNextChar == maBuffer`).
///
/// @param[out] lexeme      Receives text, position and category of the signature
///                         on a match; left untouched otherwise.
/// @param[in]  sigId       Lexeme category to report for this signature.
/// @param[in]  sigToTest   Octet sequence making up the signature.
/// @param[in]  sigLength   Number of octets in `sigToTest`.
/// @return                 `true` if the buffered data starts with the signature
///                         (which is then consumed), `false` otherwise.
bool Scanner::GetNextSignatureLexeme(Lexeme& lexeme, Lexeme::Category sigId, const Octet* sigToTest, size_t sigLength)
{
    POV_PARSER_ASSERT(!mEndOfStream);
    POV_PARSER_ASSERT(mBase == 0);
    POV_PARSER_ASSERT(mpNextChar == maBuffer);

    // An empty signature would trivially "match" without consuming anything,
    // yielding an empty lexeme and leaving the scanner at the start of the
    // stream, where the very same match would be reported again.
    if (sigLength == 0)
        return false;

    // Make the signed-to-unsigned conversion explicit; the buffer end never
    // precedes the read position, so the difference is non-negative.
    const size_t bufferedOctets = static_cast<size_t>(mpBufferEnd - mpNextChar);
    if (bufferedOctets < sigLength)
        return false;
    if (std::memcmp(mpNextChar, sigToTest, sigLength) != 0)
        return false;

    lexeme.text = UTF8String(reinterpret_cast<const char*>(sigToTest), sigLength);
    lexeme.position = mCurrentPosition;
    lexeme.category = sigId;

    // Consume the signature.
    mpNextChar += sigLength;
    mCurrentPosition.offset += sigLength;
    return true;
}
790+
791+
/// Convenience overload taking the signature as a NUL-terminated string.
///
/// The signature length is determined with `std::strlen()`, so the signature
/// itself must not contain NUL octets.
///
/// @param[out] lexeme      Receives the signature lexeme on a match.
/// @param[in]  sigId       Lexeme category to report for this signature.
/// @param[in]  sigToTest   NUL-terminated octet sequence making up the signature.
/// @return                 Result of the octet-based overload.
bool Scanner::GetNextSignatureLexeme(Lexeme& lexeme, Lexeme::Category sigId, const char* sigToTest)
{
    const size_t signatureLength = std::strlen(sigToTest);
    const Octet* signatureOctets = reinterpret_cast<const Octet*>(sigToTest);

    return GetNextSignatureLexeme(lexeme, sigId, signatureOctets, signatureLength);
}
795+
796+
//------------------------------------------------------------------------------
797+
761798
bool Scanner::GetRaw(unsigned char* buffer, size_t size)
762799
{
763800
POV_PARSER_ASSERT(!mEndOfStream);

source/parser/scanner.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,7 @@ struct Lexeme
7575
kFloatLiteral,
7676
kStringLiteral,
7777
kOther,
78+
kUTF8SignatureBOM,
7879
};
7980
UTF8String text;
8081
LexemePosition position;
@@ -224,6 +225,9 @@ class Scanner
224225
void EatNextLineComment();
225226
bool EatNextBlockComment();
226227

228+
bool GetNextSignatureLexeme(Lexeme& lexeme, Lexeme::Category sigId, const Octet* sigToTest, size_t sigLength);
229+
bool GetNextSignatureLexeme(Lexeme& lexeme, Lexeme::Category sigId, const char* sigToTest);
230+
227231
/// Copy character to lexeme, then advance stream.
228232
/// @return `true` if another character is available.
229233
bool CopyAndAdvance(Lexeme& lexeme);

0 commit comments

Comments
 (0)