Skip to content

Commit 139c807

Browse files
authored
Add StringUtil::toUTF8()
1 parent f8fb22c commit 139c807

File tree

8 files changed

+133
-5
lines changed

8 files changed

+133
-5
lines changed

.github/workflows/validate.yml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,11 +50,9 @@ jobs:
5050
- "8.1"
5151
- "8.2"
5252
- "8.3"
53-
5453
dependencies:
5554
- lowest
5655
- highest
57-
5856
illuminate:
5957
- ^8.73
6058
- ^9

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,10 @@ See [GitHub releases](https://github.com/mll-lab/php-utils/releases).
99

1010
## Unreleased
1111

12+
### Added
13+
14+
- Add `StringUtil::toUTF8()`
15+
1216
## v1.11.0
1317

1418
### Added

src/QxManager/FilledRow.php

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -50,9 +50,9 @@ public function __construct(
5050
string $targetName,
5151
string $signalCh1,
5252
string $signalCh2,
53-
int $referenceCopies = null,
54-
string $wellNotes = null,
55-
string $rdqConversionFactor = null
53+
?int $referenceCopies = null,
54+
?string $wellNotes = null,
55+
?string $rdqConversionFactor = null
5656
) {
5757
$this->targetName = $targetName;
5858
$this->signalCh1 = $signalCh1;

src/StringUtil.php

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,21 @@
66

77
final class StringUtil
88
{
9+
/** https://en.wikipedia.org/wiki/Byte_order_mark#UTF-8 */
10+
public const UTF_8_BOM = "\xEF\xBB\xBF";
11+
12+
/** https://en.wikipedia.org/wiki/Byte_order_mark#UTF-16 */
13+
public const UTF_16_BIG_ENDIAN_BOM = "\xFE\xFF";
14+
15+
/** https://en.wikipedia.org/wiki/Byte_order_mark#UTF-16 */
16+
public const UTF_16_LITTLE_ENDIAN_BOM = "\xFF\xFE";
17+
18+
/** https://en.wikipedia.org/wiki/Byte_order_mark#UTF-32 */
19+
public const UTF_32_BIG_ENDIAN_BOM = "\x00\x00\xFE\xFF";
20+
21+
/** https://en.wikipedia.org/wiki/Byte_order_mark#UTF-32 */
22+
public const UTF_32_LITTLE_ENDIAN_BOM = "\xFF\xFE\x00\x00";
23+
924
/** @param iterable<string|null> $parts */
1025
public static function joinNonEmpty(string $glue, iterable $parts): string
1126
{
@@ -82,6 +97,69 @@ public static function normalizeLineEndings(string $input, string $to = "\r\n"):
8297
return \Safe\preg_replace("/\r\n|\r|\n/", $to, $input);
8398
}
8499

100+
/** Convert a string of unknown encoding (UTF-8, UTF-16, UTF-32, Windows-1252 or ISO-8859-1) to UTF-8. */
101+
public static function toUTF8(string $string): string
{
    // Let mbstring try a strict detection first and only fall back to our own heuristics when it gives up.
    $detected = mb_detect_encoding($string, null, true);
    $sourceEncoding = $detected === false
        ? self::guessEncoding($string)
        : $detected;

    // Reset the last error so that error_get_last() below can not report something stale.
    error_clear_last();
    // @phpstan-ignore-next-line \Safe\mb_convert_encoding is not available in older PHP versions
    $result = mb_convert_encoding($string, 'UTF-8', $sourceEncoding);
    // @phpstan-ignore-next-line mb_convert_encoding can return false in older PHP versions
    if (! is_string($result)) {
        $lastError = error_get_last();
        $actualType = gettype($result);
        $message = $lastError['message'] ?? "Expected mb_convert_encoding to return string, got {$actualType}.";
        $severity = $lastError['type'] ?? 1;

        throw new \ErrorException($message, 0, $severity);
    }

    return $result;
}
121+
122+
/**
 * Guess the encoding of a text from its byte order mark or, failing that, from its byte ranges.
 *
 * Recognizes the UTF-8, UTF-32 and UTF-16 byte order marks. Without a known BOM,
 * the text is assumed to be Windows-1252 or ISO-8859-1.
 *
 * @throws \Exception when there is no known BOM and the text contains bytes that are control characters or invalid in ISO-8859-1 and CP-1252
 *
 * @return string an encoding name that is passed on to mb_convert_encoding()
 */
private static function guessEncoding(string $text): string
{
    // @see https://www.php.net/manual/en/function.mb-detect-encoding.php#91051
    $first3 = substr($text, 0, 3);
    if ($first3 === self::UTF_8_BOM) {
        return 'UTF-8';
    }

    // The UTF-32 BOMs are 4 bytes long, so 4 bytes have to be compared.
    // They must be checked before the UTF-16 BOMs, because the UTF-32LE BOM
    // starts with the same 2 bytes as the UTF-16LE BOM.
    $first4 = substr($text, 0, 4);
    if ($first4 === self::UTF_32_BIG_ENDIAN_BOM) {
        return 'UTF-32BE';
    }
    if ($first4 === self::UTF_32_LITTLE_ENDIAN_BOM) {
        return 'UTF-32LE';
    }

    $first2 = substr($text, 0, 2);
    if ($first2 === self::UTF_16_BIG_ENDIAN_BOM) {
        return 'UTF-16BE';
    }
    if ($first2 === self::UTF_16_LITTLE_ENDIAN_BOM) {
        return 'UTF-16LE';
    }

    // https://kence.org/2019/11/27/detecting-windows-1252-encoding
    // If the string contains characters in ranges that are either control characters
    // or invalid for ISO-8859-1 or CP-1252, we are unable to reliably guess.
    if (\Safe\preg_match('/[\x00-\x08\x0E-\x1F\x81\x8D\x8F\x90\x9D]/', $text) !== 0) {
        throw new \Exception("Can not determine UTF encoding of text: {$text}");
    }

    // If we get here, we're going to assume it's either Windows-1252 or ISO-8859-1.
    // If the string contains characters in the ISO-8859-1 reserved range, that's probably Windows-1252.
    if (\Safe\preg_match('/[\x80-\x9F]/', $text) !== 0) {
        return 'Windows-1252';
    }

    // Give up and return ISO-8859-1.
    return 'ISO-8859-1';
}
162+
85163
/**
86164
* Pad a number with leading zeros.
87165
*

tests/StringUtilTest.php

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,47 @@ public function testNormalizeLineEndings(): void
8585
);
8686
}
8787

88+
public function testUTF8(): void
{
    $fixturePath = __DIR__ . '/StringUtilTestData/UTF-8.csv';
    $contents = \Safe\file_get_contents($fixturePath);

    // The fixture already is UTF-8, so the conversion must leave it untouched.
    self::assertSame('test', $contents);
    self::assertSame('test', StringUtil::toUTF8($contents));
}
97+
98+
public function testUTF16LE(): void
{
    // The zero width no-break space (ZWNBSP) is a deprecated use of the Unicode character at code point U+FEFF.
    // Character U+FEFF is intended for use as a Byte Order Mark (BOM) at the start of a file
    // -> https://unicode-explorer.com/c/FEFF
    //
    // The 10 byte fixture presumably consists of the UTF-16LE BOM followed by "test" (TODO confirm).
    // The conversion keeps the BOM as a leading U+FEFF, so it is spelled out through the constant
    // here instead of relying on an invisible character inside the string literal.
    $expectedUTF8 = StringUtil::UTF_8_BOM . 'test';

    $string = \Safe\file_get_contents(__DIR__ . '/StringUtilTestData/UTF-16LE.csv');
    self::assertNotSame($expectedUTF8, $string);
    self::assertSame($expectedUTF8, StringUtil::toUTF8($string));
}
109+
110+
public function testWindows1252(): void
{
    // Expected file content as UTF-8, one entry per line. The trailing empty entry yields the final newline.
    $expectedLines = [
        'FileName,WellId,Sample Description,From [bp],To [bp],Average Size [bp],Conc. [ng/µl],Region Molarity [nmol/l],% of Total,Region Comment',
        '2023-05-16 - 13.01.27.D1000,A12,RNA_191_23-049780_A1,170,550,312,23.7,121,95.50,IDT',
        '2023-05-16 - 13.01.27.D1000,B12,RNA_191_23-049782_B1,170,550,308,16.1,82.5,92.27,IDT',
        '2023-05-16 - 13.01.27.D1000,C12,RNA_191_23-049776_C1,170,550,310,16.7,85.3,93.76,IDT',
        '2023-05-16 - 13.01.27.D1000,D12,RNA_191_23-049778_D1,170,550,307,11.4,58.6,91.65,IDT',
        '2023-05-16 - 13.01.27.D1000,E12,RNA_191_NTC_E1,170,550,304,9.63,50.0,90.88,IDT',
        '',
    ];
    $expectedUTF8 = implode("\n", $expectedLines);

    $rawContents = \Safe\file_get_contents(__DIR__ . '/StringUtilTestData/windows-1252.csv');
    // The raw bytes must differ from the UTF-8 expectation, otherwise this test proves nothing.
    self::assertNotSame($expectedUTF8, $rawContents);

    // Line endings are normalized on both sides, so only the character encoding is compared.
    self::assertSame(
        StringUtil::normalizeLineEndings($expectedUTF8),
        StringUtil::normalizeLineEndings(StringUtil::toUTF8($rawContents))
    );
}
128+
88129
public function testLeftPadNumber(): void
89130
{
90131
self::assertSame(

tests/StringUtilTestData/UTF-16LE.csv

10 Bytes
Binary file not shown.

tests/StringUtilTestData/UTF-8.csv

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
test
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
FileName,WellId,Sample Description,From [bp],To [bp],Average Size [bp],Conc. [ng/�l],Region Molarity [nmol/l],% of Total,Region Comment
2+
2023-05-16 - 13.01.27.D1000,A12,RNA_191_23-049780_A1,170,550,312,23.7,121,95.50,IDT
3+
2023-05-16 - 13.01.27.D1000,B12,RNA_191_23-049782_B1,170,550,308,16.1,82.5,92.27,IDT
4+
2023-05-16 - 13.01.27.D1000,C12,RNA_191_23-049776_C1,170,550,310,16.7,85.3,93.76,IDT
5+
2023-05-16 - 13.01.27.D1000,D12,RNA_191_23-049778_D1,170,550,307,11.4,58.6,91.65,IDT
6+
2023-05-16 - 13.01.27.D1000,E12,RNA_191_NTC_E1,170,550,304,9.63,50.0,90.88,IDT

0 commit comments

Comments
 (0)