add iso8859_1 in ID3 header

schreibfaul1 · schreibfaul1 · commit b02ba826a793 · 2025-07-28T15:15:45.000+02:00
diff --git a/src/Audio.cpp b/src/Audio.cpp
@@ -3,7 +3,7 @@
     audio.cpp
 
     Created on: Oct 28.2018                                                                                                  */char audioI2SVers[] ="\
-    Version 3.4.0i                                                                                                                                ";
+    Version 3.4.0j                                                                                                                                ";
 /*  Updated on: Jul 28.2025
 
     Author: Wolle (schreibfaul1)
@@ -1877,6 +1877,7 @@ int Audio::read_ID3_Header(uint8_t* data, size_t len) {
     // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
     if(m_controlCounter == 6) { // Read the value
         m_controlCounter = 5;   // only read 256 bytes
+
         uint8_t textEncodingByte = *(data + 0);  // ID3v2 Text-Encoding-Byte
         // $00 – ISO-8859-1 (LATIN-1, Identical to ASCII for values smaller than 0x80).
         // $01 – UCS-2 encoded Unicode with BOM (Byte Order Mark), in ID3v2.2 and ID3v2.3.
@@ -1897,6 +1898,7 @@ int Audio::read_ID3_Header(uint8_t* data, size_t len) {
             return 0;
         }
 
+
         if( // proprietary not standard information
             startsWith(m_ID3Hdr.tag, "PRIV")) {
                 ;//AUDIO_LOG_ERROR("PRIV");
@@ -1905,78 +1907,51 @@ int Audio::read_ID3_Header(uint8_t* data, size_t len) {
 
         if(m_ID3Hdr.framesize == 0) return 0;
 
+        ps_ptr<char> tmp;
         size_t fs = m_ID3Hdr.framesize; // fs = size of the frame data field as read from header
         size_t bytesToCopy = fs;
-
-        if (bytesToCopy >= m_ID3Hdr.iBuffSize) { // <= oder >= hier ist wichtig!
-            bytesToCopy = m_ID3Hdr.iBuffSize - 1; // Sicherstellen, dass ein Null-Terminator passt
-        }
         size_t textDataLength = 0;
-        if (bytesToCopy > 0) { // Nur wenn überhaupt Daten da sind, die wir kürzen können
-            textDataLength = bytesToCopy - 1; // Dies ist die Anzahl der zu kopierenden TEXT-Bytes
+
+        if (bytesToCopy >= m_ID3Hdr.iBuffSize) { bytesToCopy = m_ID3Hdr.iBuffSize - 1;} // make sure a zero terminator fits
+        if (bytesToCopy > 0) { textDataLength = bytesToCopy - 1;}                       // Only if there are data that we can shorten
+        for (int i = 0; i < textDataLength; i++) {
+            m_ID3Hdr.iBuff[i] = *(data + i + 1);                                        // Skipped the first byte (Encoding)
         }
-        for(int i = 0; i < textDataLength; i++) {
-            m_ID3Hdr.iBuff[i] = *(data + i + 1); // Überspringt das erste Byte (Encoding)
+
+        if (textEncodingByte == 1 || textEncodingByte == 2) {                           // is UTF-16LE or UTF-16BE
+            m_ID3Hdr.iBuff[textDataLength] = 0;                                         // UTF-16: set double zero terminator
+            m_ID3Hdr.iBuff[textDataLength + 1] = 0;                                     // second '\0' for UTF-16
+        } else {
+            m_ID3Hdr.iBuff[textDataLength] = 0;                                         // only one '\0' for ISO-8859-1 or UTF-8
         }
-        m_ID3Hdr.iBuff[textDataLength] = 0;
+
         m_ID3Hdr.framesize -= fs;
         m_ID3Hdr.remainingHeaderBytes -= fs;
         uint16_t dataLength = fs - 1;
 
-        if(textEncodingByte == 0){  // latin
-            latinToUTF8(m_ID3Hdr.iBuff, false);
-            showID3Tag(m_ID3Hdr.tag, m_ID3Hdr.iBuff.get());
-        }
-
-        if(textEncodingByte == 1  && dataLength > 1) { // UTF16 with BOM
-            int8_t data_start = 0;
-            if(startsWith(m_ID3Hdr.tag, "COMM")){ // language code
-                m_ID3Hdr.lang[0] = m_ID3Hdr.iBuff[0];
-                m_ID3Hdr.lang[1] = m_ID3Hdr.iBuff[1];
-                m_ID3Hdr.lang[2] = m_ID3Hdr.iBuff[2];
-                m_ID3Hdr.lang[3] = '\0';
-                data_start += 3;
-                // log_w("language code: %s", m_ID3Hdr.lang);
-                m_ID3Hdr.byteOrderMark = static_cast<unsigned char>(m_ID3Hdr.iBuff[data_start]) == 0xFE && static_cast<unsigned char>(m_ID3Hdr.iBuff[data_start]) == 0xFF;
-                data_start += 2;
-                m_ID3Hdr.contentDescriptorTerminator_0 = m_ID3Hdr.iBuff[data_start];
-                m_ID3Hdr.contentDescriptorTerminator_1 = m_ID3Hdr.iBuff[data_start + 1];
-                m_ID3Hdr.textStringTerminator_0        = m_ID3Hdr.iBuff[data_start + 2];
-                m_ID3Hdr.textStringTerminator_1        = m_ID3Hdr.iBuff[data_start + 3];
-                data_start += 4;
-            }
-            else{
-                m_ID3Hdr.byteOrderMark = static_cast<unsigned char>(m_ID3Hdr.iBuff[data_start]) == 0xFE && static_cast<unsigned char>(m_ID3Hdr.iBuff[data_start]) == 0xFF;
-                data_start += 2;
-            }
-
-            std::u16string utf16_string;
-            for (size_t i = data_start; i < dataLength; i += 2) {
-                char16_t wchar;
-                if(m_ID3Hdr.byteOrderMark)  wchar = (static_cast<unsigned char>(m_ID3Hdr.iBuff[i]) << 8) | static_cast<unsigned char>(m_ID3Hdr.iBuff[i + 1]);
-                else                      wchar = (static_cast<unsigned char>(m_ID3Hdr.iBuff[i + 1]) << 8) | static_cast<unsigned char>(m_ID3Hdr.iBuff[i]);
-                utf16_string.push_back(wchar);
-            }
-
-            std::wstring_convert<std::codecvt_utf8_utf16<char16_t>, char16_t> converter;
-            showID3Tag(m_ID3Hdr.tag, converter.to_bytes(utf16_string).c_str());
+        if(startsWith(m_ID3Hdr.tag, "COMM")){ // language code
+            m_ID3Hdr.lang[0] = m_ID3Hdr.iBuff[0];
+            m_ID3Hdr.lang[1] = m_ID3Hdr.iBuff[1];
+            m_ID3Hdr.lang[2] = m_ID3Hdr.iBuff[2];
+            m_ID3Hdr.lang[3] = '\0';
+            m_ID3Hdr.iBuff.shift_left(4);
         }
 
-        if(textEncodingByte == 2 && dataLength > 1) { // UTF16BE
-
-            std::u16string utf16_string;
-            for (size_t i = 0; i < dataLength; i += 2) {
-                char16_t  wchar = (static_cast<unsigned char>(m_ID3Hdr.iBuff[i]) << 8) | static_cast<unsigned char>(m_ID3Hdr.iBuff[i + 1]);
-                utf16_string.push_back(wchar);
-            }
+        char encodingTab [4][12] = {"ISO-8859-1", "UTF-16", "UTF-16BE", "UTF-8"};
+        // AUDIO_LOG_INFO("Tag: %s, Length: %i, Format: %s", m_ID3Hdr.tag, textDataLength, encodingTab[textEncodingByte]);
 
-            std::wstring_convert<std::codecvt_utf8_utf16<char16_t>, char16_t> converter;
-            showID3Tag(m_ID3Hdr.tag, converter.to_bytes(utf16_string).c_str());
+        if (textEncodingByte == 0) { // ISO-8859-1
+            tmp.copy_from_iso8859_1((const uint8_t*)m_ID3Hdr.iBuff.get(), "tmp");
+        } else if (textEncodingByte == 1 || textEncodingByte == 2) { // UTF-16LE oder UTF-16BE
+            bool isBigEndian = (textEncodingByte == 2);
+            tmp.copy_from_utf16((const uint8_t*)m_ID3Hdr.iBuff.get(), isBigEndian, "tmp");
+        } else if (textEncodingByte == 3) { // UTF-8
+            // copy directly, no conversion is needed
+            tmp.copy_from(m_ID3Hdr.iBuff.get());
         }
 
-        if(textEncodingByte == 3) { // utf8
-            showID3Tag(m_ID3Hdr.tag, m_ID3Hdr.iBuff.get());
-        }
+        showID3Tag(m_ID3Hdr.tag, tmp.c_get());
+
         return fs;
     }
     // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
@@ -2009,7 +1984,6 @@ int Audio::read_ID3_Header(uint8_t* data, size_t len) {
                 if(!res){AUDIO_LOG_ERROR("http range request was not successful"); return 0;}
                 res = parseHttpRangeHeader();
                 if(!res){AUDIO_LOG_ERROR("http range response was not successful"); return 0;}
-            //    return 0;
             }
             if(m_dataMode == AUDIO_LOCALFILE){
                 uint32_t pos = m_audiofile.position();
@@ -2031,22 +2005,17 @@ int Audio::read_ID3_Header(uint8_t* data, size_t len) {
             m_ID3Hdr.SYLT.time_stamp_format =  syltBuff[4];
             m_ID3Hdr.SYLT.content_type =       syltBuff[5];
             idx = 6;
-            if(m_ID3Hdr.SYLT.text_encoding == 0 || m_ID3Hdr.SYLT.text_encoding == 3){ // utf-8
-                len = content_descriptor.copy_from((const char*)(syltBuff.get() + idx), "content_descriptor");
-            }
-            else{ // utf-16
-                len = content_descriptor.copy_from_utf16((const uint8_t*)(syltBuff.get() + idx), isBigEndian, "content_descriptor");
-            }
+            if     (m_ID3Hdr.SYLT.text_encoding == 0) len = 1 + content_descriptor.copy_from_iso8859_1((const uint8_t*)(syltBuff.get() + idx), "content_descriptor");         // iso8859_1
+            else if(m_ID3Hdr.SYLT.text_encoding == 3) len = 1 + content_descriptor.copy_from((const char*)(syltBuff.get() + idx), "content_descriptor");                      // utf-8
+            else                                      len = 2 + content_descriptor.copy_from_utf16((const uint8_t*)(syltBuff.get() + idx), isBigEndian, "content_descriptor");// utf-16
             if(len > 2) AUDIO_INFO("Lyrics: content_descriptor: %s", content_descriptor.c_get());
+
             idx += len;
             while (idx < m_ID3Hdr.SYLT.size) {
-                    // UTF-16LE, UTF-16BE
-                if (m_ID3Hdr.SYLT.text_encoding == 1 || m_ID3Hdr.SYLT.text_encoding == 2) {
-                    idx += tmp.copy_from_utf16((const uint8_t*)(syltBuff.get() + idx), isBigEndian, "sylt-text");
-                } else {
-                    // ISO-8859-1 / UTF-8
-                    idx += tmp.copy_from((const char*)syltBuff.get() + idx, "sylt-text");
-                }
+                if      (m_ID3Hdr.SYLT.text_encoding == 0) idx += 1 + tmp.copy_from_iso8859_1((const uint8_t*)syltBuff.get() + idx, "sylt-text");            // ISO8859_1
+                else if (m_ID3Hdr.SYLT.text_encoding == 3) idx += 1 + tmp.copy_from((const char*)syltBuff.get() + idx, "sylt-text");                         // UTF-8
+                else                                       idx += 2 + tmp.copy_from_utf16((const uint8_t*)(syltBuff.get() + idx), isBigEndian, "sylt-text"); // UTF-16LE, UTF-16BE
+
                 if (tmp.starts_with("\n")) tmp.remove_before(1);
                 m_syltLines.push_back(std::move(tmp));
                 if (idx + 4 > m_ID3Hdr.SYLT.size) break; // no more 4 bytes?
@@ -2061,13 +2030,9 @@ int Audio::read_ID3_Header(uint8_t* data, size_t len) {
         return 0;
     }
 
-
-
-
-
     // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 
-    // --- section V2.2 only , greater Vers above ----
+    // --- section for ID3v2.2 only; higher versions are handled above ----
     // see https://mutagen-specs.readthedocs.io/en/latest/id3/id3v2.2.html
     if(m_controlCounter == 10) { // frames in V2.2, 3bytes identifier, 3bytes size descriptor
 
diff --git a/src/psram_unique_ptr.hpp b/src/psram_unique_ptr.hpp
@@ -299,48 +299,133 @@ class ps_ptr {
     // is UTF-16LE and will converted to: "Little London Girl"
     // UTF-16LE and UTF-16BE is often found in ID3 header
 
-    size_t copy_from_utf16(const uint8_t* src, bool is_big_endian = false, const char* name = nullptr) {
-        if (!src) { log_e("arg. is null"); return 0; }
-        std::vector<char> out;
-        size_t i = 0;
+#include <vector>
+#include <cstdint>
+#include <cstring>
+#include <stdexcept>
 
-        if(is_big_endian == false){ // maybe we have a BOM
-            if (src[i] == 0xFF && src[i + 1] == 0xFE) {
-                is_big_endian = false; // UTF-16 Little Endian
-                i += 2;  // skip byte order mark + \0\0
-            } else if (src[i] == 0xFE && src[i + 1] == 0xFF) {
-                is_big_endian = true;  // UTF-16 Big Endian
-                i += 2;  // skip byte order mark + \0\0
-            } else {
-                // BOM is missing or invalid
-            }
+// convert UTF-16 to UTF-8 and stop at zero terminator
+size_t copy_from_utf16(const uint8_t* src, bool is_big_endian = false, const char* name = nullptr) {
+    if (!src) {
+        log_e("arg. is null");
+        return 0;
+    }
+    std::vector<char> out;
+    size_t i = 0;
+
+    // BOM-Handling
+    if (src[i] == 0xFF && src[i + 1] == 0xFE) {
+        is_big_endian = false; // UTF-16LE
+        i += 2;
+    } else if (src[i] == 0xFE && src[i + 1] == 0xFF) {
+        is_big_endian = true;  // UTF-16BE
+        i += 2;
+    }
+
+    while (true) {
+        // stop at the 16-bit zero terminator (the index test only guards against size_t overflow, not the end of src)
+        if (i + 1 >= std::numeric_limits<size_t>::max() || (src[i] == 0x00 && src[i + 1] == 0x00)) {
+            break; // Nullterminator oder Ende des Puffers
         }
 
-        while (true) {
-            uint16_t ch;
+        uint16_t ch;
+        if (is_big_endian) {
+            ch = (src[i] << 8) | src[i + 1];
+        } else {
+            ch = (src[i + 1] << 8) | src[i];
+        }
+        i += 2;
+
+        uint32_t codepoint = ch;
+
+        // check for surrogate pairs
+        if (ch >= 0xD800 && ch <= 0xDBFF) { // High surrogate
+            if ((i + 1 >= std::numeric_limits<size_t>::max()) || (src[i] == 0x00 && src[i + 1] == 0x00)) {
+                log_e("Invalid surrogate pair: missing low surrogate");
+                break;
+            }
+            uint16_t ch2;
             if (is_big_endian) {
-                ch = (src[i] << 8) | src[i + 1];
+                ch2 = (src[i] << 8) | src[i + 1];
             } else {
-                ch = (src[i + 1] << 8) | src[i];
+                ch2 = (src[i + 1] << 8) | src[i];
+            }
+            if (ch2 < 0xDC00 || ch2 > 0xDFFF) {
+                log_e("Invalid surrogate pair: invalid low surrogate");
+                break;
             }
             i += 2;
-            if (ch == 0x0000) break;  // null-terminiert
+            codepoint = 0x10000 + ((ch - 0xD800) << 10) + (ch2 - 0xDC00);
+        } else if (ch >= 0xDC00 && ch <= 0xDFFF) {
+            log_e("Invalid surrogate pair: unexpected low surrogate");
+            break;
+        }
+
+        // UTF-16 → UTF-8
+        if (codepoint < 0x80) {
+            out.push_back(static_cast<char>(codepoint));
+        } else if (codepoint < 0x800) {
+            out.push_back(0xC0 | (codepoint >> 6));
+            out.push_back(0x80 | (codepoint & 0x3F));
+        } else if (codepoint < 0x10000) {
+            out.push_back(0xE0 | (codepoint >> 12));
+            out.push_back(0x80 | ((codepoint >> 6) & 0x3F));
+            out.push_back(0x80 | (codepoint & 0x3F));
+        } else if (codepoint < 0x110000) {
+            out.push_back(0xF0 | (codepoint >> 18));
+            out.push_back(0x80 | ((codepoint >> 12) & 0x3F));
+            out.push_back(0x80 | ((codepoint >> 6) & 0x3F));
+            out.push_back(0x80 | (codepoint & 0x3F));
+        } else {
+            log_e("Invalid codepoint");
+            break;
+        }
+    }
+
+    // append the zero terminator
+    out.push_back('\0');
+
+    // allocate memory and copy
+    std::size_t bytes = out.size();
+    alloc(bytes, name);
+    std::memcpy(mem.get(), out.data(), bytes);
+    return i;
+}
+
+// —————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————
+    // 📌📌📌  C O P Y _ F R O M _ I S O _ 8 8 5 9 - 1   📌📌📌
+    // convert ISO 8859-1 to UTF-8 and stop at zero terminator
+    // 0x48 0x65 0x6C 0x6C 0x6F 0x20 0xE4 0x62 0x63 0x00  -> "Hello äbc" (0xE4 is emitted as 0xC3 0xA4 in UTF-8)
 
-            // UTF-16 → UTF-8
+    size_t copy_from_iso8859_1(const uint8_t* src, const char* name = nullptr) {
+        if (!src) {
+            log_e("arg. is null");
+            return 0;
+        }
+        std::vector<char> out;
+        size_t i = 0;
+
+        while (true) {
+            uint8_t ch = src[i];
+            if (ch == 0x00) {
+                break; // 'Nullterminator'
+            }
+
+            // ISO-8859-1 → UTF-8
             if (ch < 0x80) {
-                out.push_back(static_cast<char>(ch));
-            } else if (ch < 0x800) {
-                out.push_back(0xC0 | (ch >> 6));
-                out.push_back(0x80 | (ch & 0x3F));
+                out.push_back(static_cast<char>(ch)); // Ascii area remains unchanged
             } else {
-                out.push_back(0xE0 | (ch >> 12));
-                out.push_back(0x80 | ((ch >> 6) & 0x3F));
+                // chars from 0x80 to 0xff are coded as 2-byte sequences in UTF-8
+                out.push_back(0xC0 | (ch >> 6));
                 out.push_back(0x80 | (ch & 0x3F));
             }
+            i++;
         }
+
+        // add zero terminator
         out.push_back('\0');
 
-        // save
+        // allocate and copy memory
         std::size_t bytes = out.size();
         alloc(bytes, name);
         std::memcpy(mem.get(), out.data(), bytes);
@@ -1436,6 +1521,37 @@ void unicodeToUTF8(const char* src) {
         std::size_t remaining = std::strlen(pos);
         std::memmove(str, pos, remaining + 1); // inkl. '\0'
     }
+// —————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————
+    // 📌📌📌  S H I F T _ L E F T   📌📌📌
+    // Shift the contents of the buffer n bytes to the left and fill the rest with zeros
+    // Works on the whole allocated block (allocated_size bytes), so embedded zero bytes such as in UTF-16 data are kept
+
+    void shift_left(int n) {
+        if (!mem || allocated_size == 0) {
+            printf("Error: No allocated memory or invalid buffer\n");
+            return;
+        }
+
+        if (n < 0 || static_cast<std::size_t>(n) > allocated_size) {
+            printf("Error: Invalid shift amount %d, allocated_size=%zu\n", n, allocated_size);
+            return;
+        }
+
+        // if (n % 2 != 0) {
+        //     printf("Warning: Shift amount %d is not even, adjusting to %d for UTF-16 alignment\n", n, n + 1);
+        //     n += 1; // make sure n is even for UTF-16
+        // }
+
+        char* str = mem.get();
+        if (n == 0) return;
+
+        // shift the buffer n bytes to the left
+        std::size_t remaining = allocated_size - n;
+        std::memmove(str, str + n, remaining);
+
+        // fill the rest of the memory with zeros
+        std::memset(str + remaining, 0, n);
+}
 // —————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————————
     // 📌📌📌  C O N T A I N S  📌📌📌