From c4566390d31b6adcbd5cc3cb994f3d71386dba70 Mon Sep 17 00:00:00 2001 From: gusthoff Date: Wed, 19 Feb 2025 06:10:59 +0100 Subject: [PATCH 1/7] Editorial change: move section up Move section on UTF-16 encoding/decoding up. --- .../advanced-ada/parts/data_types/strings.rst | 290 +++++++++--------- 1 file changed, 145 insertions(+), 145 deletions(-) diff --git a/content/courses/advanced-ada/parts/data_types/strings.rst b/content/courses/advanced-ada/parts/data_types/strings.rst index 9e61479ea..896806e39 100644 --- a/content/courses/advanced-ada/parts/data_types/strings.rst +++ b/content/courses/advanced-ada/parts/data_types/strings.rst @@ -561,6 +561,151 @@ not the standard "x" from the - Complete section! +UTF-16 encoding and decoding +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +So far, we've discussed the UTF-8 encoding scheme. However, other encoding +schemes exist and are supported as well. In fact, the +:ada:`Ada.Strings.UTF_Encoding` package defines three encoding schemes: + +.. code-block:: ada + + type Encoding_Scheme is (UTF_8, + UTF_16BE, + UTF_16LE); + +For example, instead of using UTF-8 encoding, we can use UTF-16 encoding +|mdash| either in the big-endian or in the little-endian version. +To convert between UTF-8 and UTF-16 encoding schemes, we can make use of the +conversion functions from the :ada:`Ada.Strings.UTF_Encoding.Conversions` +package. + +To declare a UTF-16 encoded string, we can use one of the following data types: + +- the 8-bit-character based :ada:`UTF_String` type, or + +- the 16-bit-character based :ada:`UTF_16_Wide_String` type. + +When using the 8-bit version, though, we have to specify the input and output +schemes when converting between UTF-8 and UTF-16 encoding schemes. + +Let's see a code example that makes use of both :ada:`UTF_String` and +:ada:`UTF_16_Wide_String` types: + +.. 
code:: ada run_button project=Courses.Advanced_Ada.Data_Types.Strings.String_Encoding.UTF_16_Types + + with Ada.Text_IO; use Ada.Text_IO; + + with Ada.Strings.UTF_Encoding; + use Ada.Strings.UTF_Encoding; + + with Ada.Strings.UTF_Encoding.Conversions; + use Ada.Strings.UTF_Encoding.Conversions; + + procedure Show_UTF16_Types is + Symbols_UTF_8 : constant + UTF_8_String := "♥♫"; + + Symbols_UTF_16 : constant + UTF_16_Wide_String := + Convert (Symbols_UTF_8); + -- ^ Calling Convert for UTF_8_String + -- to UTF_16_Wide_String conversion. + + Symbols_UTF_16BE : constant + UTF_String := + Convert (Item => Symbols_UTF_8, + Input_Scheme => UTF_8, + Output_Scheme => UTF_16BE); + -- ^ Calling Convert for UTF_8_String + -- to UTF_String conversion in UTF-16BE + -- encoding. + begin + Put_Line ("UTF_8_String: " + & Symbols_UTF_8); + + Put_Line ("UTF_16_Wide_String: " + & Convert (Symbols_UTF_16)); + -- ^ Calling Convert for + -- the UTF_16_Wide_String to + -- UTF_8_String conversion. + + Put_Line + ("UTF_String / UTF_16BE: " + & Convert + (Item => Symbols_UTF_16BE, + Input_Scheme => UTF_16BE, + Output_Scheme => UTF_8)); + end Show_UTF16_Types; + +In this example, we're declaring a UTF-8 encoded string and storing it in the +:ada:`Symbols_UTF_8` constant. Then, we're calling the :ada:`Convert` +functions to convert between UTF-8 and UTF-16 encoding schemes. We're using two +versions of this function: + +- the :ada:`Convert` function that returns an object of + :ada:`UTF_16_Wide_String` type for an input of :ada:`UTF_8_String` type, and + +- the :ada:`Convert` function that returns an object of :ada:`UTF_String` + type for an input of :ada:`UTF_8_String` type. + + - In this case, we need to specify the input and output schemes (see + :ada:`Input_Scheme` and :ada:`Output_Scheme` parameters in the code + example). 
+ +Previously, we've seen that the +:ada:`Ada.Strings.UTF_Encoding.Wide_Wide_Strings` package offers functions to +convert between UTF-8 and the :ada:`Wide_Wide_String` type. The same kind of +conversion functions exist for UTF-16 strings as well. Let's look at this code +example: + +.. code:: ada run_button project=Courses.Advanced_Ada.Data_Types.Strings.String_Encoding.WW_UTF_16_String + + with Ada.Text_IO; use Ada.Text_IO; + + with Ada.Strings.UTF_Encoding; + use Ada.Strings.UTF_Encoding; + + with Ada.Strings.UTF_Encoding.Wide_Wide_Strings; + use Ada.Strings.UTF_Encoding.Wide_Wide_Strings; + + with Ada.Strings.UTF_Encoding.Conversions; + use Ada.Strings.UTF_Encoding.Conversions; + + procedure Show_WW_UTF16_String is + Symbols_UTF_16 : constant + UTF_16_Wide_String := + Wide_Character'Val (16#2665#) & + Wide_Character'Val (16#266B#); + -- ^ Calling Wide_Character'Val + -- to specify the UTF-16 BE code + -- for "♥" and "♫". + + Symbols_WWS : constant + Wide_Wide_String := + Decode (Symbols_UTF_16); + -- ^ Calling Decode for UTF_16_Wide_String + -- to Wide_Wide_String conversion. + begin + Put_Line ("UTF_16_Wide_String: " + & Convert (Symbols_UTF_16)); + -- ^ Calling Convert for the + -- UTF_16_Wide_String to + -- UTF_8_String conversion. + + Put_Line ("Wide_Wide_String: " + & Encode (Symbols_WWS)); + -- ^ Calling Encode for the + -- Wide_Wide_String to + -- UTF_8_String conversion. + end Show_WW_UTF16_String; + +In this example, we're calling the :ada:`Wide_Character'Val` function to +specify the UTF-16 BE code of the "♥" and "♫" symbols. We're then using +the :ada:`Decode` function to convert between the :ada:`UTF_16_Wide_String` and +the :ada:`Wide_Wide_String` types. + + UTF-8 encoding in source-code files ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -773,151 +918,6 @@ Here, we use a sequence of three calls to the :ada:`Character'Val(code)` function for the UTF-8 code that corresponds to the "★" symbol. 
-UTF-16 encoding and decoding -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -So far, we've discussed the UTF-8 encoding scheme. However, other encoding -schemes exist and are supported as well. In fact, the -:ada:`Ada.Strings.UTF_Encoding` package defines three encoding schemes: - -.. code-block:: ada - - type Encoding_Scheme is (UTF_8, - UTF_16BE, - UTF_16LE); - -For example, instead of using UTF-8 encoding, we can use UTF-16 encoding -|mdash| either in the big-endian or in the little-endian version. -To convert between UTF-8 and UTF-16 encoding schemes, we can make use of the -conversion functions from the :ada:`Ada.Strings.UTF_Encoding.Conversions` -package. - -To declare a UTF-16 encoded string, we can use one of the following data types: - -- the 8-bit-character based :ada:`UTF_String` type, or - -- the 16-bit-character based :ada:`UTF_16_Wide_String` type. - -When using the 8-bit version, though, we have to specify the input and output -schemes when converting between UTF-8 and UTF-16 encoding schemes. - -Let's see a code example that makes use of both :ada:`UTF_String` and -:ada:`UTF_16_Wide_String` types: - -.. code:: ada run_button project=Courses.Advanced_Ada.Data_Types.Strings.String_Encoding.UTF_16_Types - - with Ada.Text_IO; use Ada.Text_IO; - - with Ada.Strings.UTF_Encoding; - use Ada.Strings.UTF_Encoding; - - with Ada.Strings.UTF_Encoding.Conversions; - use Ada.Strings.UTF_Encoding.Conversions; - - procedure Show_UTF16_Types is - Symbols_UTF_8 : constant - UTF_8_String := "♥♫"; - - Symbols_UTF_16 : constant - UTF_16_Wide_String := - Convert (Symbols_UTF_8); - -- ^ Calling Convert for UTF_8_String - -- to UTF_16_Wide_String conversion. - - Symbols_UTF_16BE : constant - UTF_String := - Convert (Item => Symbols_UTF_8, - Input_Scheme => UTF_8, - Output_Scheme => UTF_16BE); - -- ^ Calling Convert for UTF_8_String - -- to UTF_String conversion in UTF-16BE - -- encoding. 
- begin - Put_Line ("UTF_8_String: " - & Symbols_UTF_8); - - Put_Line ("UTF_16_Wide_String: " - & Convert (Symbols_UTF_16)); - -- ^ Calling Convert for - -- the UTF_16_Wide_String to - -- UTF_8_String conversion. - - Put_Line - ("UTF_String / UTF_16BE: " - & Convert - (Item => Symbols_UTF_16BE, - Input_Scheme => UTF_16BE, - Output_Scheme => UTF_8)); - end Show_UTF16_Types; - -In this example, we're declaring a UTF-8 encoded string and storing it in the -:ada:`Symbols_UTF_8` constant. Then, we're calling the :ada:`Convert` -functions to convert between UTF-8 and UTF-16 encoding schemes. We're using two -versions of this function: - -- the :ada:`Convert` function that returns an object of - :ada:`UTF_16_Wide_String` type for an input of :ada:`UTF_8_String` type, and - -- the :ada:`Convert` function that returns an object of :ada:`UTF_String` - type for an input of :ada:`UTF_8_String` type. - - - In this case, we need to specify the input and output schemes (see - :ada:`Input_Scheme` and :ada:`Output_Scheme` parameters in the code - example). - -Previously, we've seen that the -:ada:`Ada.Strings.UTF_Encoding.Wide_Wide_Strings` package offers functions to -convert between UTF-8 and the :ada:`Wide_Wide_String` type. The same kind of -conversion functions exist for UTF-16 strings as well. Let's look at this code -example: - -.. code:: ada run_button project=Courses.Advanced_Ada.Data_Types.Strings.String_Encoding.WW_UTF_16_String - - with Ada.Text_IO; use Ada.Text_IO; - - with Ada.Strings.UTF_Encoding; - use Ada.Strings.UTF_Encoding; - - with Ada.Strings.UTF_Encoding.Wide_Wide_Strings; - use Ada.Strings.UTF_Encoding.Wide_Wide_Strings; - - with Ada.Strings.UTF_Encoding.Conversions; - use Ada.Strings.UTF_Encoding.Conversions; - - procedure Show_WW_UTF16_String is - Symbols_UTF_16 : constant - UTF_16_Wide_String := - Wide_Character'Val (16#2665#) & - Wide_Character'Val (16#266B#); - -- ^ Calling Wide_Character'Val - -- to specify the UTF-16 BE code - -- for "♥" and "♫". 
- - Symbols_WWS : constant - Wide_Wide_String := - Decode (Symbols_UTF_16); - -- ^ Calling Decode for UTF_16_Wide_String - -- to Wide_Wide_String conversion. - begin - Put_Line ("UTF_16_Wide_String: " - & Convert (Symbols_UTF_16)); - -- ^ Calling Convert for the - -- UTF_16_Wide_String to - -- UTF_8_String conversion. - - Put_Line ("Wide_Wide_String: " - & Encode (Symbols_WWS)); - -- ^ Calling Encode for the - -- Wide_Wide_String to - -- UTF_8_String conversion. - end Show_WW_UTF16_String; - -In this example, we're calling the :ada:`Wide_Character'Val` function to -specify the UTF-16 BE code of the "♥" and "♫" symbols. We're then using -the :ada:`Decode` function to convert between the :ada:`UTF_16_Wide_String` and -the :ada:`Wide_Wide_String` types. - - .. _Adv_Ada_Image_Attribute: Image attribute From df0193a67b9f3da5a7497399973e90e0938db615 Mon Sep 17 00:00:00 2001 From: gusthoff Date: Wed, 19 Feb 2025 06:11:57 +0100 Subject: [PATCH 2/7] Editorial change: split UTF-8 applications into new section --- content/courses/advanced-ada/parts/data_types/strings.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/content/courses/advanced-ada/parts/data_types/strings.rst b/content/courses/advanced-ada/parts/data_types/strings.rst index 896806e39..003a99b4f 100644 --- a/content/courses/advanced-ada/parts/data_types/strings.rst +++ b/content/courses/advanced-ada/parts/data_types/strings.rst @@ -706,6 +706,13 @@ the :ada:`Decode` function to convert between the :ada:`UTF_16_Wide_String` and the :ada:`Wide_Wide_String` types. +UTF-8 applications +------------------ + +In this section, we take a further look into UTF-8 encoding and some real-world +applications. First, we discuss the use of UTF-8 encoding in source-code files. +Then, we talk about parsing UTF-8 files using *wide-wide* strings. 
+ UTF-8 encoding in source-code files ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From 59a798108ed0888525599b9b3022efaeb8d156d5 Mon Sep 17 00:00:00 2001 From: gusthoff Date: Wed, 19 Feb 2025 06:12:12 +0100 Subject: [PATCH 3/7] Editorial change: add anchor --- content/courses/advanced-ada/parts/data_types/strings.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/content/courses/advanced-ada/parts/data_types/strings.rst b/content/courses/advanced-ada/parts/data_types/strings.rst index 003a99b4f..c63d2797d 100644 --- a/content/courses/advanced-ada/parts/data_types/strings.rst +++ b/content/courses/advanced-ada/parts/data_types/strings.rst @@ -804,6 +804,8 @@ Otherwise, we might get unexpected behavior. (Interpreting the characters in UTF-8 format as Latin-1 format is certainly an example of what we want to avoid here.) +.. _Adv_Ada_GNAT_W8_Switch: + .. admonition:: In the GNAT toolchain You can use UTF-8 coding in your source-code file and initialize strings of From 9675c722f8e45af586ea2abd1d1b90987f9f596e Mon Sep 17 00:00:00 2001 From: gusthoff Date: Wed, 19 Feb 2025 06:14:09 +0100 Subject: [PATCH 4/7] Editorial change: move todo item to new section --- .../advanced-ada/parts/data_types/strings.rst | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/content/courses/advanced-ada/parts/data_types/strings.rst b/content/courses/advanced-ada/parts/data_types/strings.rst index c63d2797d..4ca2e595b 100644 --- a/content/courses/advanced-ada/parts/data_types/strings.rst +++ b/content/courses/advanced-ada/parts/data_types/strings.rst @@ -550,17 +550,6 @@ not the standard "x" from the :wikipedia:`Basic Latin block `.) -.. - TO BE DONE: - - Parsing UTF-8 files for Wide-Wide-String processing - ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - .. todo:: - - - Complete section! 
- - UTF-16 encoding and decoding ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -927,6 +916,17 @@ Here, we use a sequence of three calls to the :ada:`Character'Val(code)` function for the UTF-8 code that corresponds to the "★" symbol. +.. + TO BE DONE: + + Parsing UTF-8 files for Wide-Wide-String processing + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + .. todo:: + + - Complete section! + + .. _Adv_Ada_Image_Attribute: Image attribute From 3c6906b266685abebbde4d14ecc3f62b74c186a9 Mon Sep 17 00:00:00 2001 From: gusthoff Date: Wed, 19 Feb 2025 06:16:42 +0100 Subject: [PATCH 5/7] Editorial change: remove todo item --- .../courses/advanced-ada/parts/data_types/strings.rst | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/content/courses/advanced-ada/parts/data_types/strings.rst b/content/courses/advanced-ada/parts/data_types/strings.rst index 4ca2e595b..1fa28fc6d 100644 --- a/content/courses/advanced-ada/parts/data_types/strings.rst +++ b/content/courses/advanced-ada/parts/data_types/strings.rst @@ -916,15 +916,8 @@ Here, we use a sequence of three calls to the :ada:`Character'Val(code)` function for the UTF-8 code that corresponds to the "★" symbol. -.. - TO BE DONE: - - Parsing UTF-8 files for Wide-Wide-String processing - ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - .. todo:: - - - Complete section! +Parsing UTF-8 files for Wide-Wide-String processing +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. 
_Adv_Ada_Image_Attribute: From 1082cfb476b8e816eb7572cb5f24671e3560db51 Mon Sep 17 00:00:00 2001 From: gusthoff Date: Wed, 19 Feb 2025 06:17:24 +0100 Subject: [PATCH 6/7] Editorial change: adding anchor --- content/courses/advanced-ada/parts/data_types/strings.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/content/courses/advanced-ada/parts/data_types/strings.rst b/content/courses/advanced-ada/parts/data_types/strings.rst index 1fa28fc6d..d9a24ad66 100644 --- a/content/courses/advanced-ada/parts/data_types/strings.rst +++ b/content/courses/advanced-ada/parts/data_types/strings.rst @@ -916,6 +916,8 @@ Here, we use a sequence of three calls to the :ada:`Character'Val(code)` function for the UTF-8 code that corresponds to the "★" symbol. +.. _Adv_Ada_UTF_8_Files_Wide_Wide_Strings: + Parsing UTF-8 files for Wide-Wide-String processing ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From 7750d9d1abab6c79501dd756d626742320b3de4b Mon Sep 17 00:00:00 2001 From: gusthoff Date: Wed, 19 Feb 2025 06:18:40 +0100 Subject: [PATCH 7/7] Adding section on parsing UTF-8 files for Wide-Wide-String processing --- .../advanced-ada/parts/data_types/strings.rst | 311 ++++++++++++++++++ 1 file changed, 311 insertions(+) diff --git a/content/courses/advanced-ada/parts/data_types/strings.rst b/content/courses/advanced-ada/parts/data_types/strings.rst index d9a24ad66..c937b1c40 100644 --- a/content/courses/advanced-ada/parts/data_types/strings.rst +++ b/content/courses/advanced-ada/parts/data_types/strings.rst @@ -921,6 +921,317 @@ function for the UTF-8 code that corresponds to the "★" symbol. Parsing UTF-8 files for Wide-Wide-String processing ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +A typical use-case is to parse a text file in UTF-8 format and use *wide-wide* +strings to process the lines of that file. Before we look at the implementation +that does that, let's first write a procedure that generates a text file in +UTF-8 format: + +.. 
code:: ada no_button project=Courses.Advanced_Ada.Data_Types.Strings.String_Encoding.UTF_8_File_Processing + + with Ada.Text_IO; use Ada.Text_IO; + + with Ada.Strings.UTF_Encoding; + use Ada.Strings.UTF_Encoding; + + procedure Generate_UTF_8_File + (Output_File_Name : String) + is + F : File_Type; + begin + Create (F, Out_File, Output_File_Name); + Put_Line (F, UTF_8_String'("♥♫")); + Put_Line + (F, + UTF_8_String'("مرحبا يا عالم")); + Close (F); + end Generate_UTF_8_File; + +Procedure :ada:`Generate_UTF_8_File` writes two strings with non-Latin +characters into the UTF-8 file indicated by the :ada:`Output_File_Name` +parameter. + +In addition, let's implement an auxiliary procedure to display the individual +characters of a *wide-wide* string: + +.. code:: ada no_button project=Courses.Advanced_Ada.Data_Types.Strings.String_Encoding.UTF_8_File_Processing + + with Ada.Text_IO; use Ada.Text_IO; + + with Ada.Strings.UTF_Encoding; + use Ada.Strings.UTF_Encoding; + + with Ada.Strings.UTF_Encoding.Wide_Wide_Strings; + use Ada.Strings.UTF_Encoding.Wide_Wide_Strings; + + procedure Put_Line_UTF_8_Characters + (WSS : Wide_Wide_String) + is + procedure Put_Complete_UTF_8_String + (WSS : Wide_Wide_String) + is + S_UTF_8 : constant UTF_8_String := + Encode (WSS); + begin + Put_Line ("STRING: " & S_UTF_8); + Put_Line ("Length: " + & WSS'Length'Image + & " characters"); + New_Line; + end Put_Complete_UTF_8_String; + + -- This is a wrapper function of the + -- Encode function for the + -- Wide_Wide_Character type: + function Encode (Item : Wide_Wide_Character) + return UTF_8_String + is + SC : constant Wide_Wide_String (1 .. 1) + := (1 => Item); + -- We need a 1-character string + -- for the call to Encode. 
+ begin + return Encode (SC); + end Encode; + + procedure Put_UTF_8_Characters + (WSS : Wide_Wide_String) is + begin + for I in WSS'Range loop + Put (I'Image & ": "); + Put (Encode (WSS (I))); + New_Line; + end loop; + end Put_UTF_8_Characters; + + begin + Put_Complete_UTF_8_String (WSS); + Put_UTF_8_Characters (WSS); + Put_Line ("--------------------"); + end Put_Line_UTF_8_Characters; + +Finally, let's look at a code example that parses a UTF-8 file: + +.. code:: ada run_button main=show_utf_8.adb project=Courses.Advanced_Ada.Data_Types.Strings.String_Encoding.UTF_8_File_Processing + + with Ada.Text_IO; use Ada.Text_IO; + + with Ada.Strings.UTF_Encoding; + use Ada.Strings.UTF_Encoding; + + with Ada.Strings.UTF_Encoding.Wide_Wide_Strings; + use Ada.Strings.UTF_Encoding.Wide_Wide_Strings; + + with Generate_UTF_8_File; + with Put_Line_UTF_8_Characters; + + procedure Show_UTF_8 is + + File_Name : constant String := + "utf-8_test.txt"; + + procedure Read_UTF_8_File + (Input_File_Name : String) + is + F : File_Type; + begin + Open (F, In_File, Input_File_Name); + + while not End_Of_File (F) loop + declare + S_UTF8 : constant UTF_8_String + := Get_Line (F); + S : constant Wide_Wide_String + := Decode (S_UTF8); + begin + Put_Line_UTF_8_Characters (S); + end; + end loop; + Close (F); + end Read_UTF_8_File; + + begin + Generate_UTF_8_File (File_Name); + Read_UTF_8_File (File_Name); + end Show_UTF_8; + +The :ada:`Show_UTF_8` procedure first calls the :ada:`Generate_UTF_8_File` +procedure to generate a text file in UTF-8 format, and then calls the nested +:ada:`Read_UTF_8_File` procedure to read from that file |mdash| this is done by +reading the 8-bit UTF-8 encoded string and decoding it into a string of +:ada:`Wide_Wide_String` type. + +(Note that we call the auxiliary :ada:`Put_Line_UTF_8_Characters` procedure to +display the characters of each line we read from the UTF-8 file.) 
+ +For completeness, we include the nested :ada:`Read_Write_UTF_8_File` procedure, +which not only reads each line from a UTF-8 file, but also writes it into +another UTF-8 file: + +.. code:: ada run_button main=show_utf_8.adb project=Courses.Advanced_Ada.Data_Types.Strings.String_Encoding.UTF_8_File_Processing + + with Ada.Text_IO; use Ada.Text_IO; + + with Ada.Strings.UTF_Encoding; + use Ada.Strings.UTF_Encoding; + + with Ada.Strings.UTF_Encoding.Wide_Wide_Strings; + use Ada.Strings.UTF_Encoding.Wide_Wide_Strings; + + with Generate_UTF_8_File; + with Put_Line_UTF_8_Characters; + + procedure Show_UTF_8 is + + File_Name_In : constant String := + "utf-8_test.txt"; + File_Name_Out : constant String := + "utf-8_copy.txt"; + + procedure Read_Write_UTF_8_File + (Input_File_Name, + Output_File_Name : String) + is + F_In, F_Out : File_Type; + begin + Open (F_In, In_File, Input_File_Name); + Create (F_Out, Out_File, Output_File_Name); + + while not End_Of_File (F_In) loop + declare + S : constant Wide_Wide_String := + Decode (Get_Line (F_In)); + begin + Put_Line_UTF_8_Characters (S); + Put_Line (F_Out, Encode (S)); + end; + end loop; + + Close (F_In); + Close (F_Out); + end Read_Write_UTF_8_File; + + begin + Generate_UTF_8_File (File_Name_In); + + Read_Write_UTF_8_File + (Input_File_Name => File_Name_In, + Output_File_Name => File_Name_Out); + end Show_UTF_8; + +In the nested :ada:`Read_Write_UTF_8_File` procedure, we see both :ada:`Decode` +and :ada:`Encode` functions being called to convert from and to the +:ada:`UTF_8_String` type, respectively. + +.. admonition:: In the GNAT toolchain + + If we use the ``-gnatW8`` switch, which we mentioned + :ref:`in a previous section <Adv_Ada_GNAT_W8_Switch>`, the implementation + of :ada:`Generate_UTF_8_File` and :ada:`Put_Line_UTF_8_Characters` must be + adapted. In addition, we can simplify the implementation of the + :ada:`Show_UTF_8` procedure, too. 
(Note, however, that the previous + implementation, which makes use of the :ada:`Decode` and :ada:`Encode` + functions, would work fine as well.) + + .. code:: ada run_button main=show_utf_8.adb project=Courses.Advanced_Ada.Data_Types.Strings.String_Encoding.UTF_8_File_Processing switches=Compiler(-gnatW8); + + with Ada.Wide_Wide_Text_IO; + use Ada.Wide_Wide_Text_IO; + + procedure Put_Line_UTF_8_Characters + (WSS : Wide_Wide_String) + is + procedure Put_Complete_UTF_8_String + (WSS : Wide_Wide_String) + is + begin + Put_Line ("STRING: " & WSS); + Put_Line ("Length: " + & WSS'Length'Wide_Wide_Image + & " characters"); + New_Line; + end Put_Complete_UTF_8_String; + + procedure Put_UTF_8_Characters + (WSS : Wide_Wide_String) + is + begin + for I in WSS'Range loop + Put (I'Wide_Wide_Image & ": "); + Put (WSS (I)); + New_Line; + end loop; + end Put_UTF_8_Characters; + + begin + Put_Complete_UTF_8_String (WSS); + Put_UTF_8_Characters (WSS); + Put_Line ("--------------------"); + end Put_Line_UTF_8_Characters; + + with Ada.Wide_Wide_Text_IO; + use Ada.Wide_Wide_Text_IO; + + procedure Generate_UTF_8_File + (Output_File_Name : String) + is + F : File_Type; + begin + Create (F, Out_File, Output_File_Name); + Put_Line (F, "♥♫"); + Put_Line (F, "مرحبا يا عالم"); + Close (F); + end Generate_UTF_8_File; + + with Ada.Wide_Wide_Text_IO; + use Ada.Wide_Wide_Text_IO; + + with Generate_UTF_8_File; + with Put_Line_UTF_8_Characters; + + procedure Show_UTF_8 is + + File_Name_In : constant String := + "utf-8_test.txt"; + File_Name_Out : constant String := + "utf-8_copy.txt"; + + procedure Read_Write_UTF_8_File + (Input_File_Name, + Output_File_Name : String) + is + F_In, F_Out : File_Type; + begin + Open (F_In, In_File, Input_File_Name); + Create (F_Out, Out_File, Output_File_Name); + + while not End_Of_File (F_In) loop + declare + S : constant Wide_Wide_String := + Get_Line (F_In); + begin + Put_Line_UTF_8_Characters (S); + Put_Line (F_Out, S); + end; + end loop; + + Close (F_In); + 
Close (F_Out); + end Read_Write_UTF_8_File; + + begin + Generate_UTF_8_File (File_Name_In); + + Read_Write_UTF_8_File + (Input_File_Name => File_Name_In, + Output_File_Name => File_Name_Out); + end Show_UTF_8; + + In this version of the code, we've removed all references to the + :ada:`UTF_8_String` type |mdash| as well as the :ada:`Decode` and + :ada:`Encode` functions that we were using to convert from and to this + type. In this case, all UTF-8 processing happens directly using strings of + :ada:`Wide_Wide_String` type. + .. _Adv_Ada_Image_Attribute: