diff --git a/content/courses/advanced-ada/parts/data_types/strings.rst b/content/courses/advanced-ada/parts/data_types/strings.rst index 9e61479ea..c937b1c40 100644 --- a/content/courses/advanced-ada/parts/data_types/strings.rst +++ b/content/courses/advanced-ada/parts/data_types/strings.rst @@ -550,16 +550,157 @@ not the standard "x" from the :wikipedia:`Basic Latin block `.) -.. - TO BE DONE: +UTF-16 encoding and decoding +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +So far, we've discussed the UTF-8 encoding scheme. However, other encoding +schemes exist and are supported as well. In fact, the +:ada:`Ada.Strings.UTF_Encoding` package defines three encoding schemes: + +.. code-block:: ada + + type Encoding_Scheme is (UTF_8, + UTF_16BE, + UTF_16LE); + +For example, instead of using UTF-8 encoding, we can use UTF-16 encoding +|mdash| either in the big-endian or in the little-endian version. +To convert between UTF-8 and UTF-16 encoding schemes, we can make use of the +conversion functions from the :ada:`Ada.Strings.UTF_Encoding.Conversions` +package. + +To declare a UTF-16 encoded string, we can use one of the following data types: + +- the 8-bit-character based :ada:`UTF_String` type, or + +- the 16-bit-character based :ada:`UTF_16_Wide_String` type. + +When using the 8-bit version, though, we have to specify the input and output +schemes when converting between UTF-8 and UTF-16 encoding schemes. + +Let's see a code example that makes use of both :ada:`UTF_String` and +:ada:`UTF_16_Wide_String` types: + +.. 
code:: ada run_button project=Courses.Advanced_Ada.Data_Types.Strings.String_Encoding.UTF_16_Types + + with Ada.Text_IO; use Ada.Text_IO; + + with Ada.Strings.UTF_Encoding; + use Ada.Strings.UTF_Encoding; + + with Ada.Strings.UTF_Encoding.Conversions; + use Ada.Strings.UTF_Encoding.Conversions; + + procedure Show_UTF16_Types is + Symbols_UTF_8 : constant + UTF_8_String := "♥♫"; + + Symbols_UTF_16 : constant + UTF_16_Wide_String := + Convert (Symbols_UTF_8); + -- ^ Calling Convert for UTF_8_String + -- to UTF_16_Wide_String conversion. + + Symbols_UTF_16BE : constant + UTF_String := + Convert (Item => Symbols_UTF_8, + Input_Scheme => UTF_8, + Output_Scheme => UTF_16BE); + -- ^ Calling Convert for UTF_8_String + -- to UTF_String conversion in UTF-16BE + -- encoding. + begin + Put_Line ("UTF_8_String: " + & Symbols_UTF_8); + + Put_Line ("UTF_16_Wide_String: " + & Convert (Symbols_UTF_16)); + -- ^ Calling Convert for + -- the UTF_16_Wide_String to + -- UTF_8_String conversion. + + Put_Line + ("UTF_String / UTF_16BE: " + & Convert + (Item => Symbols_UTF_16BE, + Input_Scheme => UTF_16BE, + Output_Scheme => UTF_8)); + end Show_UTF16_Types; - Parsing UTF-8 files for Wide-Wide-String processing - ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +In this example, we're declaring a UTF-8 encoded string and storing it in the +:ada:`Symbols_UTF_8` constant. Then, we're calling the :ada:`Convert` +functions to convert between UTF-8 and UTF-16 encoding schemes. We're using two +versions of this function: - .. todo:: +- the :ada:`Convert` function that returns an object of + :ada:`UTF_16_Wide_String` type for an input of :ada:`UTF_8_String` type, and - - Complete section! +- the :ada:`Convert` function that returns an object of :ada:`UTF_String` + type for an input of :ada:`UTF_8_String` type. + - In this case, we need to specify the input and output schemes (see + :ada:`Input_Scheme` and :ada:`Output_Scheme` parameters in the code + example). 
+ +Previously, we've seen that the +:ada:`Ada.Strings.UTF_Encoding.Wide_Wide_Strings` package offers functions to +convert between UTF-8 and the :ada:`Wide_Wide_String` type. The same kind of +conversion functions exist for UTF-16 strings as well. Let's look at this code +example: + +.. code:: ada run_button project=Courses.Advanced_Ada.Data_Types.Strings.String_Encoding.WW_UTF_16_String + + with Ada.Text_IO; use Ada.Text_IO; + + with Ada.Strings.UTF_Encoding; + use Ada.Strings.UTF_Encoding; + + with Ada.Strings.UTF_Encoding.Wide_Wide_Strings; + use Ada.Strings.UTF_Encoding.Wide_Wide_Strings; + + with Ada.Strings.UTF_Encoding.Conversions; + use Ada.Strings.UTF_Encoding.Conversions; + + procedure Show_WW_UTF16_String is + Symbols_UTF_16 : constant + UTF_16_Wide_String := + Wide_Character'Val (16#2665#) & + Wide_Character'Val (16#266B#); + -- ^ Calling Wide_Character'Val + -- to specify the UTF-16 BE code + -- for "♥" and "♫". + + Symbols_WWS : constant + Wide_Wide_String := + Decode (Symbols_UTF_16); + -- ^ Calling Decode for UTF_16_Wide_String + -- to Wide_Wide_String conversion. + begin + Put_Line ("UTF_16_Wide_String: " + & Convert (Symbols_UTF_16)); + -- ^ Calling Convert for the + -- UTF_16_Wide_String to + -- UTF_8_String conversion. + + Put_Line ("Wide_Wide_String: " + & Encode (Symbols_WWS)); + -- ^ Calling Encode for the + -- Wide_Wide_String to + -- UTF_8_String conversion. + end Show_WW_UTF16_String; + +In this example, we're calling the :ada:`Wide_Character'Val` function to +specify the UTF-16 BE code of the "♥" and "♫" symbols. We're then using +the :ada:`Decode` function to convert between the :ada:`UTF_16_Wide_String` and +the :ada:`Wide_Wide_String` types. + + +UTF-8 applications +------------------ + +In this section, we take a further look into UTF-8 encoding and some real-world +applications. First, we discuss the use of UTF-8 encoding in source-code files. +Then, we talk about parsing UTF-8 files using *wide-wide* strings. 
UTF-8 encoding in source-code files ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -652,6 +793,8 @@ Otherwise, we might get unexpected behavior. (Interpreting the characters in UTF-8 format as Latin-1 format is certainly an example of what we want to avoid here.) +.. _Adv_Ada_GNAT_W8_Switch: + .. admonition:: In the GNAT toolchain You can use UTF-8 coding in your source-code file and initialize strings of @@ -773,105 +916,158 @@ Here, we use a sequence of three calls to the :ada:`Character'Val(code)` function for the UTF-8 code that corresponds to the "★" symbol. -UTF-16 encoding and decoding -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -So far, we've discussed the UTF-8 encoding scheme. However, other encoding -schemes exist and are supported as well. In fact, the -:ada:`Ada.Strings.UTF_Encoding` package defines three encoding schemes: +.. _Adv_Ada_UTF_8_Files_Wide_Wide_Strings: -.. code-block:: ada +Parsing UTF-8 files for Wide-Wide-String processing +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - type Encoding_Scheme is (UTF_8, - UTF_16BE, - UTF_16LE); +A typical use-case is to parse a text file in UTF-8 format and use *wide-wide* +strings to process the lines of that file. Before we look at the implementation +that does that, let's first write a procedure that generates a text file in +UTF-8 format: -For example, instead of using UTF-8 encoding, we can use UTF-16 encoding -|mdash| either in the big-endian or in the little-endian version. -To convert between UTF-8 and UTF-16 encoding schemes, we can make use of the -conversion functions from the :ada:`Ada.Strings.UTF_Encoding.Conversions` -package. +.. 
code:: ada no_button project=Courses.Advanced_Ada.Data_Types.Strings.String_Encoding.UTF_8_File_Processing -To declare a UTF-16 encoded string, we can use one of the following data types: + with Ada.Text_IO; use Ada.Text_IO; -- the 8-bit-character based :ada:`UTF_String` type, or + with Ada.Strings.UTF_Encoding; + use Ada.Strings.UTF_Encoding; -- the 16-bit-character based :ada:`UTF_16_Wide_String` type. + procedure Generate_UTF_8_File + (Output_File_Name : String) + is + F : File_Type; + begin + Create (F, Out_File, Output_File_Name); + Put_Line (F, UTF_8_String'("♥♫")); + Put_Line + (F, + UTF_8_String'("مرحبا يا عالم")); + Close (F); + end Generate_UTF_8_File; -When using the 8-bit version, though, we have to specify the input and output -schemes when converting between UTF-8 and UTF-16 encoding schemes. +Procedure :ada:`Generate_UTF_8_File` writes two strings with non-Latin +characters into the UTF-8 file indicated by the :ada:`Output_File_Name` +parameter. -Let's see a code example that makes use of both :ada:`UTF_String` and -:ada:`UTF_16_Wide_String` types: +In addition, let's implement an auxiliary procedure to display the individual +characters of a *wide-wide* string: -.. code:: ada run_button project=Courses.Advanced_Ada.Data_Types.Strings.String_Encoding.UTF_16_Types +.. 
code:: ada no_button project=Courses.Advanced_Ada.Data_Types.Strings.String_Encoding.UTF_8_File_Processing with Ada.Text_IO; use Ada.Text_IO; with Ada.Strings.UTF_Encoding; use Ada.Strings.UTF_Encoding; - with Ada.Strings.UTF_Encoding.Conversions; - use Ada.Strings.UTF_Encoding.Conversions; + with Ada.Strings.UTF_Encoding.Wide_Wide_Strings; + use Ada.Strings.UTF_Encoding.Wide_Wide_Strings; - procedure Show_UTF16_Types is - Symbols_UTF_8 : constant - UTF_8_String := "♥♫"; + procedure Put_Line_UTF_8_Characters + (WSS : Wide_Wide_String) + is + procedure Put_Complete_UTF_8_String + (WSS : Wide_Wide_String) + is + S_UTF_8 : constant UTF_8_String := + Encode (WSS); + begin + Put_Line ("STRING: " & S_UTF_8); + Put_Line ("Length: " + & WSS'Length'Image + & " characters"); + New_Line; + end Put_Complete_UTF_8_String; + + -- This is a wrapper function of the + -- Encode function for the + -- Wide_Wide_Character type: + function Encode (Item : Wide_Wide_Character) + return UTF_8_String + is + SC : constant Wide_Wide_String (1 .. 1) + := (1 => Item); + -- We need a 1-character string + -- for the call to Encode. + begin + return Encode (SC); + end Encode; - Symbols_UTF_16 : constant - UTF_16_Wide_String := - Convert (Symbols_UTF_8); - -- ^ Calling Convert for UTF_8_String - -- to UTF_16_Wide_String conversion. + procedure Put_UTF_8_Characters + (WSS : Wide_Wide_String) is + begin + for I in WSS'Range loop + Put (I'Image & ": "); + Put (Encode (WSS (I))); + New_Line; + end loop; + end Put_UTF_8_Characters; - Symbols_UTF_16BE : constant - UTF_String := - Convert (Item => Symbols_UTF_8, - Input_Scheme => UTF_8, - Output_Scheme => UTF_16BE); - -- ^ Calling Convert for UTF_8_String - -- to UTF_String conversion in UTF-16BE - -- encoding. 
begin - Put_Line ("UTF_8_String: " - & Symbols_UTF_8); + Put_Complete_UTF_8_String (WSS); + Put_UTF_8_Characters (WSS); + Put_Line ("--------------------"); + end Put_Line_UTF_8_Characters; - Put_Line ("UTF_16_Wide_String: " - & Convert (Symbols_UTF_16)); - -- ^ Calling Convert for - -- the UTF_16_Wide_String to - -- UTF_8_String conversion. +Finally, let's look at a code example that parses a UTF-8 file: - Put_Line - ("UTF_String / UTF_16BE: " - & Convert - (Item => Symbols_UTF_16BE, - Input_Scheme => UTF_16BE, - Output_Scheme => UTF_8)); - end Show_UTF16_Types; +.. code:: ada run_button main=show_utf_8.adb project=Courses.Advanced_Ada.Data_Types.Strings.String_Encoding.UTF_8_File_Processing -In this example, we're declaring a UTF-8 encoded string and storing it in the -:ada:`Symbols_UTF_8` constant. Then, we're calling the :ada:`Convert` -functions to convert between UTF-8 and UTF-16 encoding schemes. We're using two -versions of this function: + with Ada.Text_IO; use Ada.Text_IO; -- the :ada:`Convert` function that returns an object of - :ada:`UTF_16_Wide_String` type for an input of :ada:`UTF_8_String` type, and + with Ada.Strings.UTF_Encoding; + use Ada.Strings.UTF_Encoding; -- the :ada:`Convert` function that returns an object of :ada:`UTF_String` - type for an input of :ada:`UTF_8_String` type. + with Ada.Strings.UTF_Encoding.Wide_Wide_Strings; + use Ada.Strings.UTF_Encoding.Wide_Wide_Strings; - - In this case, we need to specify the input and output schemes (see - :ada:`Input_Scheme` and :ada:`Output_Scheme` parameters in the code - example). + with Generate_UTF_8_File; + with Put_Line_UTF_8_Characters; -Previously, we've seen that the -:ada:`Ada.Strings.UTF_Encoding.Wide_Wide_Strings` package offers functions to -convert between UTF-8 and the :ada:`Wide_Wide_String` type. The same kind of -conversion functions exist for UTF-16 strings as well. Let's look at this code -example: + procedure Show_UTF_8 is -.. 
code:: ada run_button project=Courses.Advanced_Ada.Data_Types.Strings.String_Encoding.WW_UTF_16_String + File_Name : constant String := + "utf-8_test.txt"; + + procedure Read_UTF_8_File + (Input_File_Name : String) + is + F : File_Type; + begin + Open (F, In_File, Input_File_Name); + + while not End_Of_File (F) loop + declare + S_UTF8 : constant UTF_8_String + := Get_Line (F); + S : constant Wide_Wide_String + := Decode (S_UTF8); + begin + Put_Line_UTF_8_Characters (S); + end; + end loop; + Close (F); + end Read_UTF_8_File; + + begin + Generate_UTF_8_File (File_Name); + Read_UTF_8_File (File_Name); + end Show_UTF_8; + +The :ada:`Show_UTF_8` procedure first calls the :ada:`Generate_UTF_8_File` +procedure to generate a text file in UTF-8 format, and then calls the nested +:ada:`Read_UTF_8_File` procedure to read from that file |mdash| this is done by +reading the 8-bit UTF-8 encoded string and decoding it into a string of +:ada:`Wide_Wide_String` type. + +(Note that we call the auxiliary :ada:`Put_Line_UTF_8_Characters` procedure to +display the characters of each line we read from the UTF-8 file.) + +For completeness, we include the nested :ada:`Read_Write_UTF_8_File` procedure, +which not only reads each line from a UTF-8 file, but also writes it into +another UTF-8 file: + +.. code:: ada run_button main=show_utf_8.adb project=Courses.Advanced_Ada.Data_Types.Strings.String_Encoding.UTF_8_File_Processing with Ada.Text_IO; use Ada.Text_IO; @@ -881,41 +1077,160 @@ example: with Ada.Strings.UTF_Encoding.Wide_Wide_Strings; use Ada.Strings.UTF_Encoding.Wide_Wide_Strings; - with Ada.Strings.UTF_Encoding.Conversions; - use Ada.Strings.UTF_Encoding.Conversions; + with Generate_UTF_8_File; + with Put_Line_UTF_8_Characters; - procedure Show_WW_UTF16_String is - Symbols_UTF_16 : constant - UTF_16_Wide_String := - Wide_Character'Val (16#2665#) & - Wide_Character'Val (16#266B#); - -- ^ Calling Wide_Character'Val - -- to specify the UTF-16 BE code - -- for "♥" and "♫". 
+ procedure Show_UTF_8 is + + File_Name_In : constant String := + "utf-8_test.txt"; + File_Name_Out : constant String := + "utf-8_copy.txt"; + + procedure Read_Write_UTF_8_File + (Input_File_Name, + Output_File_Name : String) + is + F_In, F_Out : File_Type; + begin + Open (F_In, In_File, Input_File_Name); + Create (F_Out, Out_File, Output_File_Name); + + while not End_Of_File (F_In) loop + declare + S : constant Wide_Wide_String := + Decode (Get_Line (F_In)); + begin + Put_Line_UTF_8_Characters (S); + Put_Line (F_Out, Encode (S)); + end; + end loop; + + Close (F_In); + Close (F_Out); + end Read_Write_UTF_8_File; - Symbols_WWS : constant - Wide_Wide_String := - Decode (Symbols_UTF_16); - -- ^ Calling Decode for UTF_16_Wide_String - -- to Wide_Wide_String conversion. begin - Put_Line ("UTF_16_Wide_String: " - & Convert (Symbols_UTF_16)); - -- ^ Calling Convert for the - -- UTF_16_Wide_String to - -- UTF_8_String conversion. + Generate_UTF_8_File (File_Name_In); - Put_Line ("Wide_Wide_String: " - & Encode (Symbols_WWS)); - -- ^ Calling Encode for the - -- Wide_Wide_String to - -- UTF_8_String conversion. - end Show_WW_UTF16_String; + Read_Write_UTF_8_File + (Input_File_Name => File_Name_In, + Output_File_Name => File_Name_Out); + end Show_UTF_8; -In this example, we're calling the :ada:`Wide_Character'Val` function to -specify the UTF-16 BE code of the "♥" and "♫" symbols. We're then using -the :ada:`Decode` function to convert between the :ada:`UTF_16_Wide_String` and -the :ada:`Wide_Wide_String` types. +In the nested :ada:`Read_Write_UTF_8_File` procedure, we see both :ada:`Decode` +and :ada:`Encode` functions being called to convert from and to the +:ada:`UTF_8_String` type, respectively. + +.. admonition:: In the GNAT toolchain + + If we use the ``-gnatW8`` switch, which we mentioned + :ref:`in a previous section <Adv_Ada_GNAT_W8_Switch>`, the implementation + of :ada:`Generate_UTF_8_File` and :ada:`Put_Line_UTF_8_Characters` must be + adapted. 
In addition, we can simplify the implementation of the + :ada:`Show_UTF_8` procedure, too. (Note, however, that the previous + implementation, which makes use of the :ada:`Decode` and :ada:`Encode` + functions, would work fine as well.) + + .. code:: ada run_button main=show_utf_8.adb project=Courses.Advanced_Ada.Data_Types.Strings.String_Encoding.UTF_8_File_Processing switches=Compiler(-gnatW8); + + with Ada.Wide_Wide_Text_IO; + use Ada.Wide_Wide_Text_IO; + + procedure Put_Line_UTF_8_Characters + (WSS : Wide_Wide_String) + is + procedure Put_Complete_UTF_8_String + (WSS : Wide_Wide_String) + is + begin + Put_Line ("STRING: " & WSS); + Put_Line ("Length: " + & WSS'Length'Wide_Wide_Image + & " characters"); + New_Line; + end Put_Complete_UTF_8_String; + + procedure Put_UTF_8_Characters + (WSS : Wide_Wide_String) + is + begin + for I in WSS'Range loop + Put (I'Wide_Wide_Image & ": "); + Put (WSS (I)); + New_Line; + end loop; + end Put_UTF_8_Characters; + + begin + Put_Complete_UTF_8_String (WSS); + Put_UTF_8_Characters (WSS); + Put_Line ("--------------------"); + end Put_Line_UTF_8_Characters; + + with Ada.Wide_Wide_Text_IO; + use Ada.Wide_Wide_Text_IO; + + procedure Generate_UTF_8_File + (Output_File_Name : String) + is + F : File_Type; + begin + Create (F, Out_File, Output_File_Name); + Put_Line (F, "♥♫"); + Put_Line (F, "مرحبا يا عالم"); + Close (F); + end Generate_UTF_8_File; + + with Ada.Wide_Wide_Text_IO; + use Ada.Wide_Wide_Text_IO; + + with Generate_UTF_8_File; + with Put_Line_UTF_8_Characters; + + procedure Show_UTF_8 is + + File_Name_In : constant String := + "utf-8_test.txt"; + File_Name_Out : constant String := + "utf-8_copy.txt"; + + procedure Read_Write_UTF_8_File + (Input_File_Name, + Output_File_Name : String) + is + F_In, F_Out : File_Type; + begin + Open (F_In, In_File, Input_File_Name); + Create (F_Out, Out_File, Output_File_Name); + + while not End_Of_File (F_In) loop + declare + S : constant Wide_Wide_String := + Get_Line (F_In); + begin + 
Put_Line_UTF_8_Characters (S); + Put_Line (F_Out, S); + end; + end loop; + + Close (F_In); + Close (F_Out); + end Read_Write_UTF_8_File; + + begin + Generate_UTF_8_File (File_Name_In); + + Read_Write_UTF_8_File + (Input_File_Name => File_Name_In, + Output_File_Name => File_Name_Out); + end Show_UTF_8; + + In this version of the code, we've removed all references to the + :ada:`UTF_8_String` type |mdash| as well as the :ada:`Decode` and + :ada:`Encode` functions that we were using to convert from and to this + type. In this case, all UTF-8 processing happens directly using strings of + :ada:`Wide_Wide_String` type. .. _Adv_Ada_Image_Attribute: