Re-implement support for UTF-8 encoded source files with signature BOM.

c-lipka · c-lipka · commit db025cfd55ad · 2018-06-01T13:19:20.000+02:00
diff --git a/source/parser/parser.h b/source/parser/parser.h
@@ -420,6 +420,7 @@ class Parser : public SceneTask
         void pre_init_tokenizer (void);
         void Initialize_Tokenizer (void);
         void Terminate_Tokenizer (void);
+        void CheckFileSignature();
         SYM_ENTRY *Add_Symbol (SYM_TABLE *table, const char *Name,TokenId Number);
         SYM_ENTRY *Add_Symbol (int Index,const char *Name,TokenId Number);
         POV_ARRAY *Parse_Array_Declare (void);
diff --git a/source/parser/parser_tokenizer.cpp b/source/parser/parser_tokenizer.cpp
@@ -95,9 +95,9 @@ void Parser::Initialize_Tokenizer()
     shared_ptr<IStream> rfile;
     UCS2String actualFileName;
 
-    pre_init_tokenizer ();
+    pre_init_tokenizer();
 
-    rfile = Locate_File(sceneData->inputFile.c_str(),POV_File_Text_POV,actualFileName,true);
+    rfile = Locate_File(sceneData->inputFile.c_str(), POV_File_Text_POV, actualFileName, true);
     if (rfile == nullptr)
         Error("Cannot open input file.");
 
@@ -106,7 +106,7 @@ void Parser::Initialize_Tokenizer()
 
     mHavePendingRawToken = false;
 
-    Got_EOF  = false;
+    Got_EOF = false;
 
     /* Init conditional stack. */
 
@@ -118,17 +118,33 @@ void Parser::Initialize_Tokenizer()
     Max_Trace_Level = MAX_TRACE_LEVEL_DEFAULT;
     Had_Max_Trace_Level = false;
 
-    /// @todo Re-enable UTF-8 signature BOM handling.
-#if 0
-    /* ignore any leading characters if they have character codes above 127, this
-       takes care of UTF-8 files with encoding info at the beginning of the file */
-    for(c = Echo_getc(); c > 127; c = Echo_getc())
-        sceneData->stringEncoding = kStringEncoding_UTF8; // switch to UTF-8 automatically [trf]
-    Echo_ungetc(c);
-#endif
+    CheckFileSignature();
 }
 
 
+//******************************************************************************
+
+
+/// Consume the next raw token if it is a file format signature; any other token is pushed back.
+void Parser::CheckFileSignature()
+{
+    RawToken firstToken;
+    if (!GetRawToken(firstToken, false))
+        return;
+    if (firstToken.expressionId != SIGNATURE_FUNCT_TOKEN)
+    {
+        UngetRawToken(firstToken); // not a signature; give the token back
+        return;
+    }
+    // Found a signature. Switch to the corresponding encoding automatically.
+    ///@todo Still need to work on the mechanism to handle string encoding.
+    switch (firstToken.id)
+    {
+        case UTF8_SIGNATURE_TOKEN:  /* sceneData->stringEncoding = kStringEncoding_UTF8; */ break;
+        default:                    POV_PARSER_ASSERT(false);                               break;
+    }
+}
+
 
 /*****************************************************************************
 *
@@ -3512,6 +3528,8 @@ void Parser::IncludeHeader(const UCS2String& formalFileName)
     mToken.is_array_elem = false;
     mToken.is_mixed_array_elem = false;
     mToken.is_dictionary_elem = false;
+
+    CheckFileSignature();
 }
 
 }
diff --git a/source/parser/rawtokenizer.cpp b/source/parser/rawtokenizer.cpp
@@ -126,6 +126,7 @@ bool RawTokenizer::GetNextToken(RawToken& token)
         case Lexeme::kFloatLiteral:     if (ProcessFloatLiteralLexeme(token))   return true;
         case Lexeme::kStringLiteral:    if (ProcessStringLiteralLexeme(token))  return true;
         case Lexeme::kOther:            if (ProcessOtherLexeme(token))          return true;
+        case Lexeme::kUTF8SignatureBOM: if (ProcessSignatureLexeme(token))      return true;
         default:                        POV_PARSER_ASSERT(false);               return true;
     }
 }
@@ -389,9 +390,30 @@ bool RawTokenizer::ProcessOtherLexeme(RawToken& token)
     return true;
 }
 
+/// Convert a file format signature lexeme into the corresponding signature token.
+/// @return `true` if the lexeme category is a known signature; `false` otherwise.
+bool RawTokenizer::ProcessSignatureLexeme(RawToken& token)
+{
+    POV_PARSER_ASSERT(token.lexeme.text.size() != 0);
+    if (token.lexeme.category != Lexeme::kUTF8SignatureBOM)
+    {
+        // Only the UTF-8 signature BOM is handled here; any other category is a logic error.
+        POV_PARSER_ASSERT(false);
+        return false;
+    }
+
+    const TokenId signatureId = UTF8_SIGNATURE_TOKEN;
+    token.id = static_cast<int>(signatureId);
+    token.expressionId = GetExpressionId(signatureId);
+    token.value = nullptr;
+    return true;
+}
+
 TokenId pov_parser::RawTokenizer::GetExpressionId(TokenId tokenId)
 {
-    if (tokenId <= FLOAT_FUNCT_TOKEN)
+    if (tokenId <= SIGNATURE_FUNCT_TOKEN)
+        return SIGNATURE_FUNCT_TOKEN;
+    else if (tokenId <= FLOAT_FUNCT_TOKEN)
         return FLOAT_FUNCT_TOKEN;
     else if (tokenId <= VECTOR_FUNCT_TOKEN)
         return VECTOR_FUNCT_TOKEN;
diff --git a/source/parser/rawtokenizer.h b/source/parser/rawtokenizer.h
@@ -241,6 +241,7 @@ class RawTokenizer
     bool ProcessOtherLexeme(RawToken& token);
     bool ProcessFloatLiteralLexeme(RawToken& token);
     bool ProcessStringLiteralLexeme(RawToken& token);
+    bool ProcessSignatureLexeme(RawToken& token);
 
     bool ProcessUCSEscapeDigits(UCS4& c, UTF8String::const_iterator& i, UTF8String::const_iterator& escapeSequenceEnd, unsigned int digits);
 
diff --git a/source/parser/reservedwords.cpp b/source/parser/reservedwords.cpp
@@ -672,6 +672,7 @@ const RESERVED_WORD Reserved_Words[] = {
 
     { COLOUR_KEY_TOKEN,             "color keyword" },
     { FLOAT_FUNCT_TOKEN,            "float function" },
+    { SIGNATURE_FUNCT_TOKEN,        "signature function" },
     { VECTOR_FUNCT_TOKEN,           "vector function" },
 
     //------------------------------------------------------------------------------
@@ -726,6 +727,7 @@ const RESERVED_WORD Reserved_Words[] = {
     { END_OF_FILE_TOKEN,            "End of File" },
     { FLOAT_TOKEN,                  "float constant" },
     { STRING_LITERAL_TOKEN,         "string literal" },
+    { UTF8_SIGNATURE_TOKEN,         "UTF-8 signature BOM" },
 
     //------------------------------------------------------------------------------
     // End of list, marked by TokenId TOKEN_COUNT_ and nullptr token string.
diff --git a/source/parser/reservedwords.h b/source/parser/reservedwords.h
@@ -56,6 +56,20 @@ struct Reserved_Word_Struct
 // Token Definitions for Parser
 enum TokenId : int
 {
+    //------------------------------------------------------------------------------
+    // Signature Tokens.
+    //
+    // All tokens that indicate a file format must go here.
+    //
+    // Please keep this section neatly sorted by the token identifier name,
+    // sorting underscore characters before digits, digits before letters,
+    // and short names before long ones, but _ignoring_ the trailing `_TOKEN` or
+    // `_ID_TOKEN`.
+
+    UTF8_SIGNATURE_TOKEN,
+
+    SIGNATURE_FUNCT_TOKEN, // must be last in this section
+
     //------------------------------------------------------------------------------
     // Float Tokens.
     //
diff --git a/source/parser/scanner.cpp b/source/parser/scanner.cpp
@@ -39,6 +39,7 @@
 // C++ variants of C standard header files
 #include <cstdio>
 #include <cstdlib>
+#include <cstring>
 
 // C++ standard header files
 #include <limits>
@@ -396,6 +397,16 @@ bool Scanner::GetNextLexeme(Lexeme& lexeme)
     if (mpSource == nullptr)
         return false;
 
+    if ((mpNextChar == maBuffer) && (mBase == 0))
+    {
+        // At the very start of the stream.
+        // Check for file format signatures.
+
+        // Currently, only UTF-8 is supported.
+        if (GetNextSignatureLexeme(lexeme, Lexeme::kUTF8SignatureBOM, u8"\uFEFF"))
+            return true;
+    }
+
     while (!mEndOfStream)
     {
         // Skip over any whitespace (including blank lines).
@@ -758,6 +769,32 @@ bool Scanner::EatNextBlockComment()
 
 //------------------------------------------------------------------------------
 
+/// If the buffered data starts with the given signature, emit it as a lexeme of category `sigId` and consume it.
+/// @return `true` if the signature was found; `false` otherwise (also on an exhausted stream), nothing consumed.
+bool Scanner::GetNextSignatureLexeme(Lexeme& lexeme, Lexeme::Category sigId, const Octet* sigToTest, size_t sigLength)
+{
+    POV_PARSER_ASSERT((mBase == 0) && (mpNextChar == maBuffer)); // only to be called at the very start of the stream
+    if (mEndOfStream || (static_cast<size_t>(mpBufferEnd - mpNextChar) < sigLength)) // exhausted, or too short
+        return false;
+    if (std::memcmp(mpNextChar, sigToTest, sigLength) != 0)
+        return false;
+
+    lexeme.text = UTF8String(reinterpret_cast<const char*>(sigToTest), sigLength);
+    lexeme.position = mCurrentPosition;
+    lexeme.category = sigId;
+    mpNextChar += sigLength;
+    mCurrentPosition.offset += sigLength;
+    return true;
+}
+
+/// Convenience overload taking the signature as a NUL-terminated string.
+bool Scanner::GetNextSignatureLexeme(Lexeme& lexeme, Lexeme::Category sigId, const char* sigToTest)
+{
+    return GetNextSignatureLexeme(lexeme, sigId, reinterpret_cast<const Octet*>(sigToTest), std::strlen(sigToTest));
+}
+
+//------------------------------------------------------------------------------
+
 bool Scanner::GetRaw(unsigned char* buffer, size_t size)
 {
     POV_PARSER_ASSERT(!mEndOfStream);
diff --git a/source/parser/scanner.h b/source/parser/scanner.h
@@ -75,6 +75,7 @@ struct Lexeme
         kFloatLiteral,
         kStringLiteral,
         kOther,
+        kUTF8SignatureBOM,
     };
     UTF8String      text;
     LexemePosition  position;
@@ -224,6 +225,9 @@ class Scanner
     void EatNextLineComment();
     bool EatNextBlockComment();
 
+    bool GetNextSignatureLexeme(Lexeme& lexeme, Lexeme::Category sigId, const Octet* sigToTest, size_t sigLength);
+    bool GetNextSignatureLexeme(Lexeme& lexeme, Lexeme::Category sigId, const char* sigToTest);
+
     /// Copy character to lexeme, then advance stream.
     /// @return `true` if another character is available.
     bool CopyAndAdvance(Lexeme& lexeme);