Skip to content

Commit d771c2c

Browse files
committed
Support more encodings
1 parent d31572d commit d771c2c

File tree

4 files changed

+114
-28
lines changed

4 files changed

+114
-28
lines changed

demo/index.php

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
<?php
2+
/**
3+
* Created by PhpStorm.
4+
* User: hocvt
5+
* Date: 5/9/18
6+
* Time: 14:27
7+
*/
8+
9+
use StupidDev\ViEncoder\Encoder\Code;
10+
use StupidDev\ViEncoder\Encoder\Converter;
11+
12+
require_once __DIR__ . "/../vendor/autoload.php";
13+
14+
$vf = "trong khu vùc c«ng ®· diÔn ra. T¸c ®éng cña c¸c cuéc c¶i c¸ch nµy kh«ng chØ lµ";
15+
16+
echo \StupidDev\ViEncoder\Encoder\Detector::usingCode( $vf);
17+
18+
echo "<br/>";
19+
20+
$unicode = Converter::changeEncode($vf, Code::CHARSET_UNICODE);
21+
22+
print_r( $unicode);

src/Encoder/Code.php

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -14,14 +14,14 @@ class Code {
1414
const CHARSET_VNI_WIN = 'VNI-WIN';
1515
const CHARSET_VIQR = 'VIQR';
1616
const CHARSET_UNICODE = 'UNICODE';
17-
// const CHARSET_VISCII = 'VISCII';
18-
// const CHARSET_VPS_WIN = 'VPS-Win';
19-
// const CHARSET_VIETWARE_F = 'VietWare-F';
20-
// const CHARSET_VIETWARE_X = 'VietWare-X';
21-
// const CHARSET_BKHCM1 = 'B.K. HCM 1';
22-
// const CHARSET_BKHCM2 = 'B.K. HCM 2';
23-
// const CHARSET_VNU = 'VNU';
24-
// const CHARSET_COMB_UNICODE = 'Comb Unicode';
25-
// const CHARSET_ESC_UNICODE = 'Esc Unicode';
26-
// const CHARSET_UTF8 = 'UTF-8';
17+
const CHARSET_VISCII = 'VISCII';
18+
const CHARSET_VPS_WIN = 'VPS-Win';
19+
const CHARSET_VIETWARE_F = 'VietWare-F';
20+
const CHARSET_VIETWARE_X = 'VietWare-X';
21+
const CHARSET_BKHCM1 = 'B.K. HCM 1';
22+
const CHARSET_BKHCM2 = 'B.K. HCM 2';
23+
const CHARSET_VNU = 'VNU';
24+
const CHARSET_COMB_UNICODE = 'Comb Unicode';
25+
const CHARSET_ESC_UNICODE = 'Esc Unicode';
26+
const CHARSET_UTF8 = 'UTF-8';
2727
}

src/Encoder/Converter.php

Lines changed: 65 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,70 @@ class Converter {
7575
"U+?", "u+?", "U+~", "u+~", "U+.", "u+.", "Y`" , "y`" , "Y." , "y." ,
7676
"Y?" , "y?" , "Y~" , "y~" , "\\."
7777
),
78+
Code::CHARSET_VISCII => array(
79+
"À", "Á", "Â", "Ã", "È", "É", "Ê", "Ì", "Í", "Ò",
80+
"Ó", "Ô", "õ", "Ù", "Ú", "Ý", "à", "á", "â", "ã",
81+
"è", "é", "ê", "ì", "í", "ò", "ó", "ô", "õ", "ù",
82+
"ú", "ý", "Å", "å", "Ð", "ð", "Î", "î", "", "û",
83+
"´", "½", "¿", "ß", "", "Õ", "Ä", "ä", "", "¤",
84+
"", "¥", "", "¦", "ç", "ç", "", "§", "", "í",
85+
"", "¢", "Æ", "Æ", "Ç", "Ç", "ƒ", "£", "", "©",
86+
"Ë", "ë", "ˆ", "¨", "Š", "ª", "", "«", "Œ", "¬",
87+
"", "­", "Ž", "®", "", "ï", "˜", "¸", "š", "÷",
88+
"", "ö", "", "¯", "", "°", "", "±", "", "²",
89+
"", "µ", "", "¾", "", "", "", "·", "³", "Þ",
90+
"", "þ", "ž", "ø", "œ", "ü", "º", "Ñ", "»", "×",
91+
"¼", "Ø", "ÿ", "æ", "¹", "ñ", "Ÿ", "Ï", "Ü", "Ü",
92+
"Ö", "Ö", "Û", "Û", "."
93+
),
94+
Code::CHARSET_VPS_WIN => array(
95+
"à", "Á", "Â", "", "×", "É", "Ê", "µ", "´", "¼",
96+
"¹", "Ô", "õ", "¨", "Ú", "Ý", "à", "á", "â", "ã",
97+
"è", "é", "ê", "ì", "í", "ò", "ó", "ô", "õ", "ù",
98+
"ú", "š", "ˆ", "æ", "ñ", "Ç", "¸", "ï", "¬", "Û",
99+
"÷", "Ö", "Ð", "Ü", "å", "å", "", "ä", "ƒ", "Ã",
100+
"", "À", "", "Ä", "Å", "Å", "Æ", "Æ", "", "í",
101+
"¢", "¢", "£", "£", "¤", "¤", "¥", "¥", "Ë", "Ë",
102+
"Þ", "È", "þ", "ë", "", "", "", "Š", "", "",
103+
"", "Í", "Œ", "Œ", "·", "Ì", "Î", "Î", "", "",
104+
"½", "Õ", "", "Ó", "", "Ò", "˜", "°", "", "",
105+
"", "", "", "§", "©", "©", "Ÿ", "ª", "¦", "«",
106+
"®", "®", "ø", "ø", "Ñ", "û", "­", "Ù", "¯", "Ø",
107+
"±", "º", "»", "»", "¿", "¿", "²", "ÿ", "œ", "œ",
108+
"", "", "Ï", "Ï", "."
109+
),
110+
Code::CHARSET_VIETWARE_X => array(
111+
"", "", "Á", "", "", "", "Ã", "Ç", "Ê", "",
112+
"", "Ä", "", "", "", "", "", "", "á", "",
113+
"", "", "ã", "ç", "ê", "", "", "ä", "", "",
114+
"", "", "À", "à", "Â", "â", "É", "é", "", "",
115+
"Å", "å", "Æ", "æ", "", "", "", "", "ÁÚ", "áú",
116+
"ÁÖ", "áö", "ÁØ", "áø", "ÁÙ", "áù", "ÁÛ", "áû", "ÀÕ", "àõ",
117+
"ÀÒ", "àò", "ÀÓ", "àó", "ÀÔ", "àô", "ÀÛ", "àû", "", "",
118+
"", "", "", "", "ÃÚ", "ãú", "ÃÖ", "ãö", "ÃØ", "ãø",
119+
"ÃÙ", "ãù", "ÃÛ", "ãû", "È", "è", "Ë", "ë", "", "",
120+
"", "", "ÄÚ", "äú", "ÄÖ", "äö", "ÄØ", "äø", "ÄÙ", "äù",
121+
"ÄÜ", "äü", "ÅÏ", "åï", "ÅÌ", "åì", "ÅÍ", "åí", "ÅÎ", "åî",
122+
"ÅÜ", "åü", "", "", "", "", "ÆÏ", "æï", "ÆÌ", "æì",
123+
"ÆÍ", "æí", "ÆÎ", "æî", "ÆÛ", "æû", "", "", "", "",
124+
"", "", "", "", "."
125+
),
126+
Code::CHARSET_VIETWARE_F => array(
127+
"Š", " ", "", "š", "¬", "¯", "ƒ", "¸", "»", "¿",
128+
"â", "„", "á", "î", "ò", "ü", "ª", "À", "¡", "º",
129+
"Ì", "Ï", "£", "Ø", "Û", "ß", "â", "¤", "á", "î",
130+
"ò", "ü", "", "Ÿ", "˜", "¢", "Ú", "Ú", "ñ", "ñ",
131+
"…", "¥", "‡", "§", "Á", "Á", "–", "", "Ê", "Ê",
132+
"Ç", "Ç", "¨", "È", "©", "É", "«", "Ë", "Å", "Å",
133+
"Â", "Â", "Ã", "Ã", "Ä", "Ä", "¦", "Æ", "±", "Ñ",
134+
"­", "Í", "®", "Î", "µ", "Õ", "²", "Ò", "³", "Ó",
135+
"´", "Ô", "Ö", "Ö", "¹", "Ù", "¼", "Ü", "ã", "ã",
136+
"à", "à", "ç", "ç", "ä", "ä", "å", "å", "æ", "æ",
137+
"è", "è", "ì", "ì", "é", "é", "ê", "ê", "ë", "ë",
138+
"í", "í", "ó", "ó", "ï", "ï", "×", "÷", "ô", "ô",
139+
"õ", "õ", "ö", "ö", "ø", "ø", "ù", "ù", "ÿ", "ÿ",
140+
"ú", "ú", "û", "û", "."
141+
),
78142
);
79143

80144
public static function changeEncode($string, $targetEncode, $sourceEncode = null){
@@ -88,7 +152,7 @@ public static function changeEncode($string, $targetEncode, $sourceEncode = null
88152
}else{
89153
$sourceEncode = Detector::usingCode($string);
90154
if(!$sourceEncode){
91-
throw new UnknowCodeException("Không xác định được bảng mã đang dùng của văn bản");
155+
throw new UnknowCodeException("Không xác định được bảng mã đang dùng của văn bản");
92156
}
93157
}
94158

src/Encoder/Detector.php

Lines changed: 17 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -11,20 +11,20 @@
1111

1212
class Detector {
1313
private static $patterns = [
14-
Code::CHARSET_TCVN3 => '/\w­[¬íêîëì]|®[¸µ¹¶·Ê¾»Æ¼½ÌÑÎϪÕÒÖÓÔÝ×ÞØÜãßäáâ«èåéæç¬íêîëìóïôñòøõùö]/',
15-
Code::CHARSET_VNI_WIN => '/[öô][ùøïûõ]|oa[ëùøïûõ]|ñ[aoeuôö][äàáåãùøïûõ]/i',
16-
Code::CHARSET_VIQR => '/u[\+\*]o[\+\*]|dd[aoe][\(\^~\'`]|[aoe]\^[~`\'\.\?]|[uo]\+[`\'~\.\?]|a\([\'`~\.\?]/i',
14+
Code::CHARSET_TCVN3 => '/\w­[¬íêîëì]|®[¸µ¹¶·Ê¾»Æ¼½ÌÑÎϪÕÒÖÓÔÝ×ÞØÜãßäáâ«èåéæç¬íêîëìóïôñòøõùö]/u',
15+
Code::CHARSET_VNI_WIN => '/[öô][ùøïûõ]|oa[ëùøïûõ]|ñ[aoeuôö][äàáåãùøïûõ]/iu',
16+
Code::CHARSET_VIQR => '/u[\+\*]o[\+\*]|dd[aoe][\(\^~\'`]|[aoe]\^[~`\'\.\?]|[uo]\+[`\'~\.\?]|a\([\'`~\.\?]/iu',
1717
Code::CHARSET_UNICODE => '/[Ạ-ỹ]/',
18-
// Code::CHARSET_VISCII => '/\wß[½¾¶þ·Þ]|ð[áàÕäã¤í¢£ÆÇè©ë¨êª«®¬­íì¸ïîóò÷öõô¯°µ±²½¾¶þ·ÞúùøüûÑ×ñØ]/',
19-
// Code::CHARSET_VPS_WIN => '/\wÜ[Ö§©®ª«]|Ç[áàåäãÃí¢¥£¤èËÈëêÍíìÎÌïóòÕõôÓÒ¶°Ö§©®ª«úùøûÛÙØ¿º]/',
20-
// Code::CHARSET_VIETWARE_F => '/\w§[¥ìéíêë]|¢[ÀªÁ¶ºÊÛÂÆÃÄÌÑÍΣÕÒÖÓÔÛØÜÙÚâßãàá¤çäè忥ìéíêëòîóïñ÷ôøõ]/',
21-
// Code::CHARSET_VIETWARE_X => '/[áãä][úöûøù]|à[õòûóô]|[åæ][ïìüíî]/i',
22-
// Code::CHARSET_BKHCM1 => '/\wõ[ïðñôòó]|\s½[ÚÛÃÄÇÈÉÊÑÐíôóÒÓÔÕ]/',
23-
// Code::CHARSET_BKHCM2 => '/\w[êöï][ëìåíî]|úû[áâåãä]|ù[æçåèé]/i',
24-
// Code::CHARSET_VNU => '/\wõ[çèéìêë]|\s½[?¡­¨¬µ¶·º¸¹¯°±´²³]/',
25-
// Code::CHARSET_COMB_UNICODE => '/[̣́̀̉̃]/i',
26-
// Code::CHARSET_UTF8 => '/(áº|á»)[¥¤§¦¬©¨«ª¯®±°·¶³²º½¼¾¿¡£¢]/i',
27-
// Code::CHARSET_ESC_UNICODE => '/&#\d\d\d\d;/i',
18+
Code::CHARSET_VISCII => '/\wß[½¾¶þ·Þ]|ð[áàÕäã¤í¢£ÆÇè©ë¨êª«®¬­íì¸ïîóò÷öõô¯°µ±²½¾¶þ·ÞúùøüûÑ×ñØ]/u',
19+
Code::CHARSET_VPS_WIN => '/\wÜ[Ö§©®ª«]|Ç[áàåäãÃí¢¥£¤èËÈëêÍíìÎÌïóòÕõôÓÒ¶°Ö§©®ª«úùøûÛÙØ¿º]/u',
20+
Code::CHARSET_VIETWARE_F => '/\w§[¥ìéíêë]|¢[ÀªÁ¶ºÊÛÂÆÃÄÌÑÍΣÕÒÖÓÔÛØÜÙÚâßãàá¤çäè忥ìéíêëòîóïñ÷ôøõ]/u',
21+
Code::CHARSET_VIETWARE_X => '/[áãä][úöûøù]|à[õòûóô]|[åæ][ïìüíî]/iu',
22+
// Code::CHARSET_BKHCM1 => '/\wõ[ïðñôòó]|\s½[ÚÛÃÄÇÈÉÊÑÐíôóÒÓÔÕ]/u',
23+
// Code::CHARSET_BKHCM2 => '/\w[êöï][ëìåíî]|úû[áâåãä]|ù[æçåèé]/iu',
24+
// Code::CHARSET_VNU => '/\wõ[çèéìêë]|\s½[?¡­¨¬µ¶·º¸¹¯°±´²³]/u',
25+
// Code::CHARSET_COMB_UNICODE => '/[̣́̀̉̃]/iu',
26+
// Code::CHARSET_UTF8 => '/(áº|á»)[¥¤§¦¬©¨«ª¯®±°·¶³²º½¼¾¿¡£¢]/ui',
27+
// Code::CHARSET_ESC_UNICODE => '/&#\d\d\d\d;/iu',
2828
];
2929

3030
/**
@@ -35,11 +35,11 @@ class Detector {
3535
// "VNU",
3636
// "B.K. HCM 2",
3737
// "B.K. HCM 1",
38-
// "VietWare-X",
39-
// "VietWare-F",
38+
"VietWare-X",
39+
"VietWare-F",
4040
"VIQR",
41-
// "VPS-Win",
42-
// "VISCII",
41+
"VPS-Win",
42+
"VISCII",
4343
"TCVN-3",
4444
"VNI-WIN",
4545
// "&#Unicode;",

0 commit comments

Comments
 (0)