Merge branch 'refactor/tokenizer' into autobuild/tokenizer

c-lipka · c-lipka · commit 0a67cf1502b0 · 2018-06-01T14:20:21.000+02:00
diff --git a/source/base/version.h b/source/base/version.h
@@ -100,7 +100,7 @@
 /// where `N` is a serial number starting at 1 in each phase, `TIME` is the number of minutes
 /// since 2000-01-01 00:00, and `FEATURE` is an arbitrary alphanumeric moniker for a particular
 /// experimental feature.
-#define POV_RAY_PRERELEASE          "x.tokenizer.9684878"
+#define POV_RAY_PRERELEASE          "x.tokenizer.9686180"
 
 #if defined(DOXYGEN) && !defined(POV_RAY_PRERELEASE)
     // Work around doxygen being unable to document undefined macros.
diff --git a/source/parser/parser.h b/source/parser/parser.h
@@ -241,7 +241,7 @@ class Parser : public SceneTask
             TokenId Function_Id;                            ///< token type ID, in case Token_Id is an identifier ID
             int context;                                    ///< context the token is local to (i.e., table index)
             DBL Token_Float;                                ///< token value (if it is a float literal)
-            int Unget_Token, End_Of_File;
+            bool Unget_Token, End_Of_File;
             void *Data;                                     ///< reference to token value (if it is a non-float identifier)
             TokenId *NumberPtr;
             void **DataPtr;
@@ -420,6 +420,7 @@ class Parser : public SceneTask
         void pre_init_tokenizer (void);
         void Initialize_Tokenizer (void);
         void Terminate_Tokenizer (void);
+        void CheckFileSignature();
         SYM_ENTRY *Add_Symbol (SYM_TABLE *table, const char *Name,TokenId Number);
         SYM_ENTRY *Add_Symbol (int Index,const char *Name,TokenId Number);
         POV_ARRAY *Parse_Array_Declare (void);
diff --git a/source/parser/parser_tokenizer.cpp b/source/parser/parser_tokenizer.cpp
@@ -95,9 +95,9 @@ void Parser::Initialize_Tokenizer()
     shared_ptr<IStream> rfile;
     UCS2String actualFileName;
 
-    pre_init_tokenizer ();
+    pre_init_tokenizer();
 
-    rfile = Locate_File(sceneData->inputFile.c_str(),POV_File_Text_POV,actualFileName,true);
+    rfile = Locate_File(sceneData->inputFile.c_str(), POV_File_Text_POV, actualFileName, true);
     if (rfile == nullptr)
         Error("Cannot open input file.");
 
@@ -106,7 +106,7 @@ void Parser::Initialize_Tokenizer()
 
     mHavePendingRawToken = false;
 
-    Got_EOF  = false;
+    Got_EOF = false;
 
     /* Init conditional stack. */
 
@@ -118,17 +118,33 @@ void Parser::Initialize_Tokenizer()
     Max_Trace_Level = MAX_TRACE_LEVEL_DEFAULT;
     Had_Max_Trace_Level = false;
 
-    /// @todo Re-enable UTF-8 signature BOM handling.
-#if 0
-    /* ignore any leading characters if they have character codes above 127, this
-       takes care of UTF-8 files with encoding info at the beginning of the file */
-    for(c = Echo_getc(); c > 127; c = Echo_getc())
-        sceneData->stringEncoding = kStringEncoding_UTF8; // switch to UTF-8 automatically [trf]
-    Echo_ungetc(c);
-#endif
+    CheckFileSignature();
 }
 
 
+//******************************************************************************
+
+/// Reads one raw token; consumes it if it is a file signature (only UTF8_SIGNATURE_TOKEN is known), else ungets it.
+void Parser::CheckFileSignature()
+{
+    RawToken signature;
+    if (GetRawToken(signature, false))
+    {
+        if (signature.expressionId == SIGNATURE_FUNCT_TOKEN)
+        {
+            // Found a signature; it is consumed here, but the automatic encoding switch below is still disabled.
+            /// @todo Still need to work on the mechanism to handle string encoding.
+            switch (signature.id)
+            {
+                case UTF8_SIGNATURE_TOKEN:  /* sceneData->stringEncoding = kStringEncoding_UTF8; */ break;
+                default:                    POV_PARSER_ASSERT(false);                               break;
+            }
+        }
+        else
+            UngetRawToken(signature); // not a signature: hand the token back
+    }
+}
+
 
 /*****************************************************************************
 *
@@ -527,7 +543,7 @@ void Parser::Read_Symbol(const RawToken& rawToken)
 
                             haveNextRawToken = PeekRawToken(nextRawToken);
 
-                            if (!haveNextRawToken || (nextRawToken.lexeme.category != Lexeme::Category::kOther) || (nextRawToken.lexeme.text != "["))
+                            if (!haveNextRawToken || (nextRawToken.lexeme.category != Lexeme::kOther) || (nextRawToken.lexeme.text != "["))
                             {
                                 breakLoop = true;
                                 break;
@@ -592,7 +608,7 @@ void Parser::Read_Symbol(const RawToken& rawToken)
                                 table = Tables [pseudoDictionary];
                                 pseudoDictionary = -1;
 
-                                if (!haveNextRawToken || (nextRawToken.lexeme.category != Lexeme::Category::kOther) ||
+                                if (!haveNextRawToken || (nextRawToken.lexeme.category != Lexeme::kOther) ||
                                     ((nextRawToken.lexeme.text != "[") && (nextRawToken.lexeme.text != ".")))
                                 {
                                     Get_Token(); // ensures the error is reported at the right token
@@ -602,7 +618,7 @@ void Parser::Read_Symbol(const RawToken& rawToken)
                             else
                                 table = reinterpret_cast<SYM_TABLE *>(*(mToken.DataPtr));
 
-                            if (haveNextRawToken && (nextRawToken.lexeme.category == Lexeme::Category::kOther) && (nextRawToken.lexeme.text == "."))
+                            if (haveNextRawToken && (nextRawToken.lexeme.category == Lexeme::kOther) && (nextRawToken.lexeme.text == "."))
                             {
                                 if (table == nullptr)
                                 {
@@ -621,7 +637,7 @@ void Parser::Read_Symbol(const RawToken& rawToken)
 
                                 Temp_Entry = Find_Symbol (table, mToken.raw.lexeme.text.c_str());
                             }
-                            else if (haveNextRawToken && (nextRawToken.lexeme.category == Lexeme::Category::kOther) && (nextRawToken.lexeme.text == "["))
+                            else if (haveNextRawToken && (nextRawToken.lexeme.category == Lexeme::kOther) && (nextRawToken.lexeme.text == "["))
                             {
                                 if (table == nullptr)
                                 {
@@ -837,7 +853,7 @@ bool Parser::GetRawToken(RawToken& rawToken, bool fastForwardToDirective)
     {
         rawToken = mPendingRawToken;
         mHavePendingRawToken = false;
-        if (!fastForwardToDirective || ((rawToken.lexeme.category == Lexeme::Category::kOther) && (rawToken.lexeme.text == "#")))
+        if (!fastForwardToDirective || ((rawToken.lexeme.category == Lexeme::kOther) && (rawToken.lexeme.text == "#")))
             return true;
     }
 
@@ -847,12 +863,12 @@ bool Parser::GetRawToken(RawToken& rawToken, bool fastForwardToDirective)
         return mTokenizer.GetNextToken(rawToken);
 }
 
-bool Parser::PeekRawToken(RawToken& lexeme)
+bool Parser::PeekRawToken(RawToken& rawToken)
 {
-    if (!GetRawToken(lexeme, false))
+    if (!GetRawToken(rawToken, false)) // no raw token could be obtained
         return false;
 
-    UngetRawToken(lexeme);
+    UngetRawToken(rawToken); // peek only: hand the token back after copying it out
     return true;
 }
 
@@ -3512,6 +3528,8 @@ void Parser::IncludeHeader(const UCS2String& formalFileName)
     mToken.is_array_elem = false;
     mToken.is_mixed_array_elem = false;
     mToken.is_dictionary_elem = false;
+
+    CheckFileSignature();
 }
 
 }
diff --git a/source/parser/rawtokenizer.cpp b/source/parser/rawtokenizer.cpp
@@ -126,6 +126,7 @@ bool RawTokenizer::GetNextToken(RawToken& token)
         case Lexeme::kFloatLiteral:     if (ProcessFloatLiteralLexeme(token))   return true;
         case Lexeme::kStringLiteral:    if (ProcessStringLiteralLexeme(token))  return true;
         case Lexeme::kOther:            if (ProcessOtherLexeme(token))          return true;
+        case Lexeme::kUTF8SignatureBOM: if (ProcessSignatureLexeme(token))      return true;
         default:                        POV_PARSER_ASSERT(false);               return true;
     }
 }
@@ -389,9 +390,30 @@ bool RawTokenizer::ProcessOtherLexeme(RawToken& token)
     return true;
 }
 
+bool RawTokenizer::ProcessSignatureLexeme(RawToken& token)
+{
+    POV_PARSER_ASSERT(token.lexeme.text.size() > 0);
+    // Convert the signature lexeme held in `token` into a raw token; returns false on an unexpected category.
+    TokenId tokenId = NOT_A_TOKEN;
+
+    switch (token.lexeme.category)
+    {
+        case Lexeme::kUTF8SignatureBOM: tokenId = UTF8_SIGNATURE_TOKEN; break;
+        default:                        POV_PARSER_ASSERT(false);       return false;
+    }
+    // Fill in the token's IDs; no value is attached to a signature token.
+    token.id = int(tokenId);
+    token.expressionId = GetExpressionId(tokenId);
+    token.value = nullptr;
+
+    return true;
+}
+
 TokenId pov_parser::RawTokenizer::GetExpressionId(TokenId tokenId)
 {
-    if (tokenId <= FLOAT_FUNCT_TOKEN)
+    if (tokenId <= SIGNATURE_FUNCT_TOKEN)
+        return SIGNATURE_FUNCT_TOKEN;
+    else if (tokenId <= FLOAT_FUNCT_TOKEN)
         return FLOAT_FUNCT_TOKEN;
     else if (tokenId <= VECTOR_FUNCT_TOKEN)
         return VECTOR_FUNCT_TOKEN;
diff --git a/source/parser/rawtokenizer.h b/source/parser/rawtokenizer.h
@@ -241,6 +241,7 @@ class RawTokenizer
     bool ProcessOtherLexeme(RawToken& token);
     bool ProcessFloatLiteralLexeme(RawToken& token);
     bool ProcessStringLiteralLexeme(RawToken& token);
+    bool ProcessSignatureLexeme(RawToken& token);
 
     bool ProcessUCSEscapeDigits(UCS4& c, UTF8String::const_iterator& i, UTF8String::const_iterator& escapeSequenceEnd, unsigned int digits);
 
diff --git a/source/parser/reservedwords.cpp b/source/parser/reservedwords.cpp
@@ -672,6 +672,7 @@ const RESERVED_WORD Reserved_Words[] = {
 
     { COLOUR_KEY_TOKEN,             "color keyword" },
     { FLOAT_FUNCT_TOKEN,            "float function" },
+    { SIGNATURE_FUNCT_TOKEN,        "signature function" },
     { VECTOR_FUNCT_TOKEN,           "vector function" },
 
     //------------------------------------------------------------------------------
@@ -726,6 +727,7 @@ const RESERVED_WORD Reserved_Words[] = {
     { END_OF_FILE_TOKEN,            "End of File" },
     { FLOAT_TOKEN,                  "float constant" },
     { STRING_LITERAL_TOKEN,         "string literal" },
+    { UTF8_SIGNATURE_TOKEN,         "UTF-8 signature BOM" },
 
     //------------------------------------------------------------------------------
     // End of list, marked by TokenId TOKEN_COUNT_ and nullptr token string.
diff --git a/source/parser/reservedwords.h b/source/parser/reservedwords.h
@@ -56,6 +56,20 @@ struct Reserved_Word_Struct
 // Token Definitions for Parser
 enum TokenId : int
 {
+    //------------------------------------------------------------------------------
+    // Signature Tokens.
+    //
+    // All tokens that indicate a file format must go here.
+    //
+    // Please keep this section neatly sorted by the token identifier name,
+    // sorting underscore characters before digits, digits before letters,
+    // and short names before long ones, but _ignoring_ the trailing `_TOKEN` or
+    // `_ID_TOKEN`.
+
+    UTF8_SIGNATURE_TOKEN,
+
+    SIGNATURE_FUNCT_TOKEN, // must be last in this section
+
     //------------------------------------------------------------------------------
     // Float Tokens.
     //
diff --git a/source/parser/scanner.cpp b/source/parser/scanner.cpp
@@ -39,6 +39,7 @@
 // C++ variants of C standard header files
 #include <cstdio>
 #include <cstdlib>
+#include <cstring>
 
 // C++ standard header files
 #include <limits>
@@ -396,6 +397,16 @@ bool Scanner::GetNextLexeme(Lexeme& lexeme)
     if (mpSource == nullptr)
         return false;
 
+    if ((mpNextChar == maBuffer) && (mBase == 0))
+    {
+        // At the very start of the stream.
+        // Check for file format signatures.
+
+        // Currently, only UTF-8 is supported.
+        if (GetNextSignatureLexeme(lexeme, Lexeme::kUTF8SignatureBOM, u8"\uFEFF"))
+            return true;
+    }
+
     while (!mEndOfStream)
     {
         // Skip over any whitespace (including blank lines).
@@ -423,7 +434,7 @@ bool Scanner::GetNextLexeme(Lexeme& lexeme)
         else if (*mpNextChar == '/')
         {
             // Either division operator or start of comment.
-            lexeme.category = Lexeme::Category::kOther;
+            lexeme.category = Lexeme::kOther;
             if (!CopyAndAdvance(lexeme))
                 return true;
             if (*mpNextChar == '/')
@@ -442,7 +453,7 @@ bool Scanner::GetNextLexeme(Lexeme& lexeme)
         else if ((*mpNextChar == '!') || (*mpNextChar == '<') || (*mpNextChar == '>'))
         {
             // Either single-character operator or comparison.
-            lexeme.category = Lexeme::Category::kOther;
+            lexeme.category = Lexeme::kOther;
             if (!CopyAndAdvance(lexeme))
                 return true;
             if (*mpNextChar == '=')
@@ -457,7 +468,7 @@ bool Scanner::GetNextLexeme(Lexeme& lexeme)
         else
         {
             // Single-character operator (or not a valid lexeme at all)
-            lexeme.category = Lexeme::Category::kOther;
+            lexeme.category = Lexeme::kOther;
             (void)CopyAndAdvance(lexeme);
             return true;
         }
@@ -513,7 +524,7 @@ bool Scanner::GetNextDirective(Lexeme& lexeme)
         {
             POV_PARSER_ASSERT(*mpNextChar == '#');
             // Found what we've been looking for.
-            lexeme.category = Lexeme::Category::kOther;
+            lexeme.category = Lexeme::kOther;
             (void)CopyAndAdvance(lexeme);
             return true;
         }
@@ -529,7 +540,7 @@ bool Scanner::GetNextWordLexeme(Lexeme& lexeme)
     POV_PARSER_ASSERT(!mEndOfStream);
     POV_PARSER_ASSERT(IsIdentifierChar1(*mpNextChar));
 
-    lexeme.category = Lexeme::Category::kWord;
+    lexeme.category = Lexeme::kWord;
 
     // Read identifier name.
     while (CopyAndAdvance(lexeme) && IsIdentifierChar2(*mpNextChar))
@@ -544,7 +555,7 @@ bool Scanner::GetNextFloatLiteralLexeme(Lexeme& lexeme)
 {
     POV_PARSER_ASSERT(!mEndOfStream);
 
-    lexeme.category = Lexeme::Category::kFloatLiteral;
+    lexeme.category = Lexeme::kFloatLiteral;
 
     if (!GetNextFloatLiteralDigits(lexeme))
         POV_PARSER_ASSERT(false);
@@ -577,7 +588,7 @@ bool Scanner::GetNextFloatLiteralOrDotLexeme(Lexeme& lexeme)
     if (CopyAndAdvance(lexeme) && IsDecimalDigit(*mpNextChar))
     {
         // Valid start of a numeric literal, starting with the decimal point.
-        lexeme.category = Lexeme::Category::kFloatLiteral;
+        lexeme.category = Lexeme::kFloatLiteral;
 
         // Read fractional part.
         if (!GetNextFloatLiteralDigits(lexeme))
@@ -593,7 +604,7 @@ bool Scanner::GetNextFloatLiteralOrDotLexeme(Lexeme& lexeme)
     else
     {
         // Dot operator.
-        lexeme.category = Lexeme::Category::kOther;
+        lexeme.category = Lexeme::kOther;
 
         // Dot has already been copied to lexeme.
 
@@ -667,7 +678,7 @@ bool Scanner::GetNextStringLiteralLexeme(Lexeme& lexeme)
     POV_PARSER_ASSERT(!mEndOfStream);
     POV_PARSER_ASSERT(*mpNextChar == '"');
 
-    lexeme.category = Lexeme::Category::kStringLiteral;
+    lexeme.category = Lexeme::kStringLiteral;
 
     if (!CopyAndAdvance(lexeme))
         return false;
@@ -758,6 +769,32 @@ bool Scanner::EatNextBlockComment()
 
 //------------------------------------------------------------------------------
 
+/// At the very start of the stream (asserted), tests for the `sigLength`-octet signature `sigToTest`; on a match
+/// stores it in `lexeme` with category `sigId`, advances past it and returns true, else returns false.
+bool Scanner::GetNextSignatureLexeme(Lexeme& lexeme, Lexeme::Category sigId, const Octet* sigToTest, size_t sigLength)
+{
+    POV_PARSER_ASSERT(!mEndOfStream);
+    POV_PARSER_ASSERT(mBase == 0);
+    POV_PARSER_ASSERT(mpNextChar == maBuffer);
+    // The pointer difference is signed; convert explicitly so the size check is not a signed/unsigned mix.
+    if ((static_cast<size_t>(mpBufferEnd - mpNextChar) < sigLength) ||
+        (std::memcmp(mpNextChar, sigToTest, sigLength) != 0))
+        return false;
+    lexeme.text = UTF8String(reinterpret_cast<const char*>(sigToTest), sigLength);
+    lexeme.position = mCurrentPosition;
+    lexeme.category = sigId;
+    mpNextChar += sigLength;
+    mCurrentPosition.offset += sigLength;
+    return true;
+}
+/// Convenience overload: `sigToTest` is a NUL-terminated string; its `strlen` gives the signature length.
+bool Scanner::GetNextSignatureLexeme(Lexeme& lexeme, Lexeme::Category sigId, const char* sigToTest)
+{
+    return GetNextSignatureLexeme(lexeme, sigId, reinterpret_cast<const Octet*>(sigToTest), std::strlen(sigToTest));
+}
+
+//------------------------------------------------------------------------------
+
 bool Scanner::GetRaw(unsigned char* buffer, size_t size)
 {
     POV_PARSER_ASSERT(!mEndOfStream);
diff --git a/source/parser/scanner.h b/source/parser/scanner.h