Skip to content

Commit 0a67cf1

Browse files
committed
Merge branch 'refactor/tokenizer' into autobuild/tokenizer
2 parents 91c9449 + db025cf commit 0a67cf1

File tree

9 files changed

+130
-31
lines changed

9 files changed

+130
-31
lines changed

source/base/version.h

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -100,7 +100,7 @@
100100
/// where `N` is a serial number starting at 1 in each phase, `TIME` is the number of minutes
101101
/// since 2000-01-01 00:00, and `FEATURE` is an arbitrary alphanumeric moniker for a particular
102102
/// experimental feature.
103-
#define POV_RAY_PRERELEASE "x.tokenizer.9684878"
103+
#define POV_RAY_PRERELEASE "x.tokenizer.9686180"
104104

105105
#if defined(DOXYGEN) && !defined(POV_RAY_PRERELEASE)
106106
// Work around doxygen being unable to document undefined macros.

source/parser/parser.h

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -241,7 +241,7 @@ class Parser : public SceneTask
241241
TokenId Function_Id; ///< token type ID, in case Token_Id is an identifier ID
242242
int context; ///< context the token is local to (i.e., table index)
243243
DBL Token_Float; ///< token value (if it is a float literal)
244-
int Unget_Token, End_Of_File;
244+
bool Unget_Token, End_Of_File;
245245
void *Data; ///< reference to token value (if it is a non-float identifier)
246246
TokenId *NumberPtr;
247247
void **DataPtr;
@@ -420,6 +420,7 @@ class Parser : public SceneTask
420420
void pre_init_tokenizer (void);
421421
void Initialize_Tokenizer (void);
422422
void Terminate_Tokenizer (void);
423+
void CheckFileSignature();
423424
SYM_ENTRY *Add_Symbol (SYM_TABLE *table, const char *Name,TokenId Number);
424425
SYM_ENTRY *Add_Symbol (int Index,const char *Name,TokenId Number);
425426
POV_ARRAY *Parse_Array_Declare (void);

source/parser/parser_tokenizer.cpp

Lines changed: 37 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -95,9 +95,9 @@ void Parser::Initialize_Tokenizer()
9595
shared_ptr<IStream> rfile;
9696
UCS2String actualFileName;
9797

98-
pre_init_tokenizer ();
98+
pre_init_tokenizer();
9999

100-
rfile = Locate_File(sceneData->inputFile.c_str(),POV_File_Text_POV,actualFileName,true);
100+
rfile = Locate_File(sceneData->inputFile.c_str(), POV_File_Text_POV, actualFileName, true);
101101
if (rfile == nullptr)
102102
Error("Cannot open input file.");
103103

@@ -106,7 +106,7 @@ void Parser::Initialize_Tokenizer()
106106

107107
mHavePendingRawToken = false;
108108

109-
Got_EOF = false;
109+
Got_EOF = false;
110110

111111
/* Init conditional stack. */
112112

@@ -118,17 +118,33 @@ void Parser::Initialize_Tokenizer()
118118
Max_Trace_Level = MAX_TRACE_LEVEL_DEFAULT;
119119
Had_Max_Trace_Level = false;
120120

121-
/// @todo Re-enable UTF-8 signature BOM handling.
122-
#if 0
123-
/* ignore any leading characters if they have character codes above 127, this
124-
takes care of UTF-8 files with encoding info at the beginning of the file */
125-
for(c = Echo_getc(); c > 127; c = Echo_getc())
126-
sceneData->stringEncoding = kStringEncoding_UTF8; // switch to UTF-8 automatically [trf]
127-
Echo_ungetc(c);
128-
#endif
121+
CheckFileSignature();
129122
}
130123

131124

125+
//******************************************************************************
126+
127+
128+
void Parser::CheckFileSignature()
129+
{
130+
RawToken signature;
131+
if (GetRawToken(signature, false))
132+
{
133+
if (signature.expressionId == SIGNATURE_FUNCT_TOKEN)
134+
{
135+
// Found a signature. Switch to the corresponding encoding automatically.
136+
///@todo Still need to work on the mechanism to handle string encoding.
137+
switch (signature.id)
138+
{
139+
case UTF8_SIGNATURE_TOKEN: /* sceneData->stringEncoding = kStringEncoding_UTF8; */ break;
140+
default: POV_PARSER_ASSERT(false); break;
141+
}
142+
}
143+
else
144+
UngetRawToken(signature);
145+
}
146+
}
147+
132148

133149
/*****************************************************************************
134150
*
@@ -527,7 +543,7 @@ void Parser::Read_Symbol(const RawToken& rawToken)
527543

528544
haveNextRawToken = PeekRawToken(nextRawToken);
529545

530-
if (!haveNextRawToken || (nextRawToken.lexeme.category != Lexeme::Category::kOther) || (nextRawToken.lexeme.text != "["))
546+
if (!haveNextRawToken || (nextRawToken.lexeme.category != Lexeme::kOther) || (nextRawToken.lexeme.text != "["))
531547
{
532548
breakLoop = true;
533549
break;
@@ -592,7 +608,7 @@ void Parser::Read_Symbol(const RawToken& rawToken)
592608
table = Tables [pseudoDictionary];
593609
pseudoDictionary = -1;
594610

595-
if (!haveNextRawToken || (nextRawToken.lexeme.category != Lexeme::Category::kOther) ||
611+
if (!haveNextRawToken || (nextRawToken.lexeme.category != Lexeme::kOther) ||
596612
((nextRawToken.lexeme.text != "[") && (nextRawToken.lexeme.text != ".")))
597613
{
598614
Get_Token(); // ensures the error is reported at the right token
@@ -602,7 +618,7 @@ void Parser::Read_Symbol(const RawToken& rawToken)
602618
else
603619
table = reinterpret_cast<SYM_TABLE *>(*(mToken.DataPtr));
604620

605-
if (haveNextRawToken && (nextRawToken.lexeme.category == Lexeme::Category::kOther) && (nextRawToken.lexeme.text == "."))
621+
if (haveNextRawToken && (nextRawToken.lexeme.category == Lexeme::kOther) && (nextRawToken.lexeme.text == "."))
606622
{
607623
if (table == nullptr)
608624
{
@@ -621,7 +637,7 @@ void Parser::Read_Symbol(const RawToken& rawToken)
621637

622638
Temp_Entry = Find_Symbol (table, mToken.raw.lexeme.text.c_str());
623639
}
624-
else if (haveNextRawToken && (nextRawToken.lexeme.category == Lexeme::Category::kOther) && (nextRawToken.lexeme.text == "["))
640+
else if (haveNextRawToken && (nextRawToken.lexeme.category == Lexeme::kOther) && (nextRawToken.lexeme.text == "["))
625641
{
626642
if (table == nullptr)
627643
{
@@ -837,7 +853,7 @@ bool Parser::GetRawToken(RawToken& rawToken, bool fastForwardToDirective)
837853
{
838854
rawToken = mPendingRawToken;
839855
mHavePendingRawToken = false;
840-
if (!fastForwardToDirective || ((rawToken.lexeme.category == Lexeme::Category::kOther) && (rawToken.lexeme.text == "#")))
856+
if (!fastForwardToDirective || ((rawToken.lexeme.category == Lexeme::kOther) && (rawToken.lexeme.text == "#")))
841857
return true;
842858
}
843859

@@ -847,12 +863,12 @@ bool Parser::GetRawToken(RawToken& rawToken, bool fastForwardToDirective)
847863
return mTokenizer.GetNextToken(rawToken);
848864
}
849865

850-
bool Parser::PeekRawToken(RawToken& lexeme)
866+
bool Parser::PeekRawToken(RawToken& rawToken)
851867
{
852-
if (!GetRawToken(lexeme, false))
868+
if (!GetRawToken(rawToken, false))
853869
return false;
854870

855-
UngetRawToken(lexeme);
871+
UngetRawToken(rawToken);
856872
return true;
857873
}
858874

@@ -3512,6 +3528,8 @@ void Parser::IncludeHeader(const UCS2String& formalFileName)
35123528
mToken.is_array_elem = false;
35133529
mToken.is_mixed_array_elem = false;
35143530
mToken.is_dictionary_elem = false;
3531+
3532+
CheckFileSignature();
35153533
}
35163534

35173535
}

source/parser/rawtokenizer.cpp

Lines changed: 23 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -126,6 +126,7 @@ bool RawTokenizer::GetNextToken(RawToken& token)
126126
case Lexeme::kFloatLiteral: if (ProcessFloatLiteralLexeme(token)) return true;
127127
case Lexeme::kStringLiteral: if (ProcessStringLiteralLexeme(token)) return true;
128128
case Lexeme::kOther: if (ProcessOtherLexeme(token)) return true;
129+
case Lexeme::kUTF8SignatureBOM: if (ProcessSignatureLexeme(token)) return true;
129130
default: POV_PARSER_ASSERT(false); return true;
130131
}
131132
}
@@ -389,9 +390,30 @@ bool RawTokenizer::ProcessOtherLexeme(RawToken& token)
389390
return true;
390391
}
391392

393+
bool RawTokenizer::ProcessSignatureLexeme(RawToken& token)
394+
{
395+
POV_PARSER_ASSERT(token.lexeme.text.size() > 0);
396+
397+
TokenId tokenId = NOT_A_TOKEN;
398+
399+
switch (token.lexeme.category)
400+
{
401+
case Lexeme::kUTF8SignatureBOM: tokenId = UTF8_SIGNATURE_TOKEN; break;
402+
default: POV_PARSER_ASSERT(false); return false;
403+
}
404+
405+
token.id = int(tokenId);
406+
token.expressionId = GetExpressionId(tokenId);
407+
token.value = nullptr;
408+
409+
return true;
410+
}
411+
392412
TokenId pov_parser::RawTokenizer::GetExpressionId(TokenId tokenId)
393413
{
394-
if (tokenId <= FLOAT_FUNCT_TOKEN)
414+
if (tokenId <= SIGNATURE_FUNCT_TOKEN)
415+
return SIGNATURE_FUNCT_TOKEN;
416+
else if (tokenId <= FLOAT_FUNCT_TOKEN)
395417
return FLOAT_FUNCT_TOKEN;
396418
else if (tokenId <= VECTOR_FUNCT_TOKEN)
397419
return VECTOR_FUNCT_TOKEN;

source/parser/rawtokenizer.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -241,6 +241,7 @@ class RawTokenizer
241241
bool ProcessOtherLexeme(RawToken& token);
242242
bool ProcessFloatLiteralLexeme(RawToken& token);
243243
bool ProcessStringLiteralLexeme(RawToken& token);
244+
bool ProcessSignatureLexeme(RawToken& token);
244245

245246
bool ProcessUCSEscapeDigits(UCS4& c, UTF8String::const_iterator& i, UTF8String::const_iterator& escapeSequenceEnd, unsigned int digits);
246247

source/parser/reservedwords.cpp

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -672,6 +672,7 @@ const RESERVED_WORD Reserved_Words[] = {
672672

673673
{ COLOUR_KEY_TOKEN, "color keyword" },
674674
{ FLOAT_FUNCT_TOKEN, "float function" },
675+
{ SIGNATURE_FUNCT_TOKEN, "signature function" },
675676
{ VECTOR_FUNCT_TOKEN, "vector function" },
676677

677678
//------------------------------------------------------------------------------
@@ -726,6 +727,7 @@ const RESERVED_WORD Reserved_Words[] = {
726727
{ END_OF_FILE_TOKEN, "End of File" },
727728
{ FLOAT_TOKEN, "float constant" },
728729
{ STRING_LITERAL_TOKEN, "string literal" },
730+
{ UTF8_SIGNATURE_TOKEN, "UTF-8 signature BOM" },
729731

730732
//------------------------------------------------------------------------------
731733
// End of list, marked by TokenId TOKEN_COUNT_ and nullptr token string.

source/parser/reservedwords.h

Lines changed: 14 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -56,6 +56,20 @@ struct Reserved_Word_Struct
5656
// Token Definitions for Parser
5757
enum TokenId : int
5858
{
59+
//------------------------------------------------------------------------------
60+
// Signature Tokens.
61+
//
62+
// All tokens that indicate a file format must go here.
63+
//
64+
// Please keep this section neatly sorted by the token identifier name,
65+
// sorting underscore characters before digits, digits before letters,
66+
// and short names before long ones, but _ignoring_ the trailing `_TOKEN` or
67+
// `_ID_TOKEN`.
68+
69+
UTF8_SIGNATURE_TOKEN,
70+
71+
SIGNATURE_FUNCT_TOKEN, // must be last in this section
72+
5973
//------------------------------------------------------------------------------
6074
// Float Tokens.
6175
//

source/parser/scanner.cpp

Lines changed: 46 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -39,6 +39,7 @@
3939
// C++ variants of C standard header files
4040
#include <cstdio>
4141
#include <cstdlib>
42+
#include <cstring>
4243

4344
// C++ standard header files
4445
#include <limits>
@@ -396,6 +397,16 @@ bool Scanner::GetNextLexeme(Lexeme& lexeme)
396397
if (mpSource == nullptr)
397398
return false;
398399

400+
if ((mpNextChar == maBuffer) && (mBase == 0))
401+
{
402+
// At the very start of the stream.
403+
// Check for file format signatures.
404+
405+
// Currently, only UTF-8 is supported.
406+
if (GetNextSignatureLexeme(lexeme, Lexeme::kUTF8SignatureBOM, u8"\uFEFF"))
407+
return true;
408+
}
409+
399410
while (!mEndOfStream)
400411
{
401412
// Skip over any whitespace (including blank lines).
@@ -423,7 +434,7 @@ bool Scanner::GetNextLexeme(Lexeme& lexeme)
423434
else if (*mpNextChar == '/')
424435
{
425436
// Either division operator or start of comment.
426-
lexeme.category = Lexeme::Category::kOther;
437+
lexeme.category = Lexeme::kOther;
427438
if (!CopyAndAdvance(lexeme))
428439
return true;
429440
if (*mpNextChar == '/')
@@ -442,7 +453,7 @@ bool Scanner::GetNextLexeme(Lexeme& lexeme)
442453
else if ((*mpNextChar == '!') || (*mpNextChar == '<') || (*mpNextChar == '>'))
443454
{
444455
// Either single-character operator or comparison.
445-
lexeme.category = Lexeme::Category::kOther;
456+
lexeme.category = Lexeme::kOther;
446457
if (!CopyAndAdvance(lexeme))
447458
return true;
448459
if (*mpNextChar == '=')
@@ -457,7 +468,7 @@ bool Scanner::GetNextLexeme(Lexeme& lexeme)
457468
else
458469
{
459470
// Single-character operator (or not a valid lexeme at all)
460-
lexeme.category = Lexeme::Category::kOther;
471+
lexeme.category = Lexeme::kOther;
461472
(void)CopyAndAdvance(lexeme);
462473
return true;
463474
}
@@ -513,7 +524,7 @@ bool Scanner::GetNextDirective(Lexeme& lexeme)
513524
{
514525
POV_PARSER_ASSERT(*mpNextChar == '#');
515526
// Found what we've been looking for.
516-
lexeme.category = Lexeme::Category::kOther;
527+
lexeme.category = Lexeme::kOther;
517528
(void)CopyAndAdvance(lexeme);
518529
return true;
519530
}
@@ -529,7 +540,7 @@ bool Scanner::GetNextWordLexeme(Lexeme& lexeme)
529540
POV_PARSER_ASSERT(!mEndOfStream);
530541
POV_PARSER_ASSERT(IsIdentifierChar1(*mpNextChar));
531542

532-
lexeme.category = Lexeme::Category::kWord;
543+
lexeme.category = Lexeme::kWord;
533544

534545
// Read identifier name.
535546
while (CopyAndAdvance(lexeme) && IsIdentifierChar2(*mpNextChar))
@@ -544,7 +555,7 @@ bool Scanner::GetNextFloatLiteralLexeme(Lexeme& lexeme)
544555
{
545556
POV_PARSER_ASSERT(!mEndOfStream);
546557

547-
lexeme.category = Lexeme::Category::kFloatLiteral;
558+
lexeme.category = Lexeme::kFloatLiteral;
548559

549560
if (!GetNextFloatLiteralDigits(lexeme))
550561
POV_PARSER_ASSERT(false);
@@ -577,7 +588,7 @@ bool Scanner::GetNextFloatLiteralOrDotLexeme(Lexeme& lexeme)
577588
if (CopyAndAdvance(lexeme) && IsDecimalDigit(*mpNextChar))
578589
{
579590
// Valid start of a numeric literal, starting with the decimal point.
580-
lexeme.category = Lexeme::Category::kFloatLiteral;
591+
lexeme.category = Lexeme::kFloatLiteral;
581592

582593
// Read fractional part.
583594
if (!GetNextFloatLiteralDigits(lexeme))
@@ -593,7 +604,7 @@ bool Scanner::GetNextFloatLiteralOrDotLexeme(Lexeme& lexeme)
593604
else
594605
{
595606
// Dot operator.
596-
lexeme.category = Lexeme::Category::kOther;
607+
lexeme.category = Lexeme::kOther;
597608

598609
// Dot has already been copied to lexeme.
599610

@@ -667,7 +678,7 @@ bool Scanner::GetNextStringLiteralLexeme(Lexeme& lexeme)
667678
POV_PARSER_ASSERT(!mEndOfStream);
668679
POV_PARSER_ASSERT(*mpNextChar == '"');
669680

670-
lexeme.category = Lexeme::Category::kStringLiteral;
681+
lexeme.category = Lexeme::kStringLiteral;
671682

672683
if (!CopyAndAdvance(lexeme))
673684
return false;
@@ -758,6 +769,32 @@ bool Scanner::EatNextBlockComment()
758769

759770
//------------------------------------------------------------------------------
760771

772+
bool Scanner::GetNextSignatureLexeme(Lexeme& lexeme, Lexeme::Category sigId, const Octet* sigToTest, size_t sigLength)
773+
{
774+
POV_PARSER_ASSERT(!mEndOfStream);
775+
POV_PARSER_ASSERT(mBase == 0);
776+
POV_PARSER_ASSERT(mpNextChar == maBuffer);
777+
778+
if ((mpBufferEnd - mpNextChar) < sigLength)
779+
return false;
780+
if (std::memcmp(mpNextChar, sigToTest, sigLength) != 0)
781+
return false;
782+
783+
lexeme.text = UTF8String(reinterpret_cast<const char*>(sigToTest), sigLength);
784+
lexeme.position = mCurrentPosition;
785+
lexeme.category = sigId;
786+
mpNextChar += sigLength;
787+
mCurrentPosition.offset += sigLength;
788+
return true;
789+
}
790+
791+
bool Scanner::GetNextSignatureLexeme(Lexeme& lexeme, Lexeme::Category sigId, const char* sigToTest)
792+
{
793+
return GetNextSignatureLexeme(lexeme, sigId, reinterpret_cast<const Octet*>(sigToTest), std::strlen(sigToTest));
794+
}
795+
796+
//------------------------------------------------------------------------------
797+
761798
bool Scanner::GetRaw(unsigned char* buffer, size_t size)
762799
{
763800
POV_PARSER_ASSERT(!mEndOfStream);

0 commit comments

Comments
 (0)