1- import dayjs from 'dayjs' ;
2- import fs from 'fs-extra' ;
1+
32import iconv from 'iconv-lite' ;
4- import os from 'os' ;
5- import path from 'path' ;
63import * as log from './debug.js' ;
7- import { strHasASCII , strHasHFKanaHira , strHasHiraKana , strOnlyASCII , strOnlyChinese , strOnlyJapanese , strOnlyJapaneseHan } from './unicode.js' ;
4+ import { strHasASCII , strHasHFKanaHira , strHasHiraKana , strOnlyASCII , strOnlyChinese , strOnlyHangul , strOnlyJapanese , strOnlyJapaneseHan } from './unicode.js' ;
85import { CHINESE_CHARS_3500 , MESSY_CJK_CHARS as MESSY_CJK_CHARS_ } from './unicode_data.js' ;
96
10- // https://github.com/bnoordhuis/node-iconv/
7+ // https://github.com/bnoordhuis/node-iconv/
118const ENCODING_FROM = [
129 'SHIFT_JIS' ,
1310 'GBK' ,
@@ -26,8 +23,6 @@ const ENCODING_TO = [
2623 // 'EUC-KR',
2724]
2825
29- const ENCODING_TRY = [ 'SHIFT_JIS' , 'UTF8' ]
30-
3126export const MESSY_CJK_CHARS = MESSY_CJK_CHARS_
3227
3328export const REGEX_MESSY_CJK = new RegExp ( `[${ MESSY_CJK_CHARS } ]` , 'u' )
@@ -43,9 +38,6 @@ export function charUnique(str) {
4338 return String . prototype . concat . call ( ...new Set ( str ) ) ;
4439}
4540
46- const nowDateStr = dayjs ( ) . format ( "YYYYMMDDHHmmss" ) ;
47- const tempfile = path . join ( os . tmpdir ( ) , `z_mediac_log_${ nowDateStr } .txt` )
48-
4941export function checkBadUnicode ( str ) {
5042 const results = [ ]
5143 if ( str . includes ( '?' ) || str . includes ( '\ufffd' ) ) {
@@ -76,6 +68,10 @@ export function checkBadUnicode(str) {
7668 // 乱码标志 Unicode私有区
7769 results . push ( [ true , 5 , `私有区` ] )
7870 }
71+ if ( / [ \ufb50 - \ufdff \ufe70 - \ufeff ] / u. test ( str ) ) {
72+ // 乱码标志 阿拉伯字符
73+ results . push ( [ true , 5 , `阿拉伯字符` ] )
74+ }
7975 if ( / [ \uff66 - \uff9d ] / u. test ( str ) ) {
8076 // 暂时忽略,还比较常用
8177 // 乱码标志 半角平假名片假名
@@ -96,8 +92,8 @@ export function hasBadCJKChar(str) {
9692 return REGEX_MESSY_CJK . test ( str ) || REGEX_MESSY_CJK_EXT . test ( str )
9793}
9894
99- export function fixCJKEnc ( str ) {
100- let results = fixCJKEncImpl ( str )
95+ export function decodeText ( str ) {
96+ let results = tryDecodeText ( str )
10197 results = results . filter ( r => r [ 2 ] >= 0 ) . sort ( ( a , b ) => b [ 2 ] - a [ 2 ] )
10298 log . debug ( '==================================' )
10399 log . debug ( str )
@@ -109,110 +105,110 @@ export function fixCJKEnc(str) {
109105 return results [ 0 ] || [ str , false , 0 , 'fallback' ] ;
110106}
111107
112- export function fixCJKEncImpl ( str ,
108+ export function tryDecodeText ( str ,
113109 fromEnc = ENCODING_FROM ,
114110 toEnc = ENCODING_TO ,
115111 threhold = 10 ) {
116112 if ( str . includes ( '?' ) || str . includes ( '\ufffd' ) ) {
117- return [ [ str , false , 0 , '信息丢失' , ' '] , ]
113+ return [ [ str , false , 0 , '[乱码字符] ' ] , ]
118114 }
119115
116+ fromEnc = fromEnc . map ( x => x . toLowerCase ( ) )
117+ toEnc = toEnc . map ( x => x . toLowerCase ( ) )
118+
120119 let results = [ ]
121120 if ( strOnlyASCII ( str ) ) {
122121 // results.push([str, false, 0])
123- return [ [ str , false , 0 , '全英文数字' , ' '] , ]
122+ return [ [ str , false , 100 , '[ASCII] ' ] , ]
124123 }
124+ const messyUnicode = REGEX_MESSY_UNICODE . test ( str )
125+ const messyCJK = REGEX_MESSY_CJK . test ( str )
126+ const messyCJKExt = REGEX_MESSY_CJK_EXT . test ( str )
125127 log . info ( '---------------------' )
126- log . info ( 'fixCJKEnc' , str )
127- if ( ! REGEX_MESSY_UNICODE . test ( str )
128- && ! REGEX_MESSY_CJK . test ( str )
129- && ! REGEX_MESSY_CJK_EXT . test ( str ) ) {
130- if ( RE_CHARS_MOST_USED . test ( str ) ) {
131- results . push ( [ str , false , 100 , '常用汉字0' , '' ] )
132- }
133- // else if (strOnlyChinese(str)) {
134- // results.push([str, false, 99, '全中文01', ''])
135- // }
136- else if ( strHasHFKanaHira ( str ) ) {
137- // 包含不用的全角半角平假名片假名
138- results . push ( [ str , false , 65 , '含半角假名0' , '' ] )
139- }
140- else {
141- // fs.appendFileSync(tempfile, str + '\n')
142- return [ [ str , false , 0 , '忽略0' , '' ] , ]
143- }
144- } else {
128+ log . info ( 'tryDecodeText' , str )
129+ if ( messyUnicode || messyCJK || messyCJKExt ) {
145130 if ( strOnlyChinese ( str ) && ! REGEX_MESSY_CJK_EXT . test ( str ) ) {
146- return [ [ str , false , 0 , `全中文02` , ` ${ REGEX_MESSY_UNICODE . test ( str ) } `] , ]
131+ return [ [ str , false , 100 , `[全中文] ` ] , ]
147132 }
148133 }
149- if ( ( strHasHiraKana ( str ) || strHasASCII ( str ) )
150- && strOnlyJapanese ( str ) && ! REGEX_MESSY_CJK . test ( str ) ) {
151- results . push ( [ str , false , 99 , '全日文01' , '' ] )
134+ if ( RE_CHARS_MOST_USED . test ( str ) ) {
135+ results . push ( [ str , false , 100 , '[常用汉字]' ] )
136+ }
137+ else if ( strHasHFKanaHira ( str ) ) {
138+ // 包含不用的全角半角平假名片假名
139+ results . push ( [ str , false , 65 , '[半角假名]' ] )
140+ }
141+ else {
142+ // fs.appendFileSync(tempfile, str + '\n')
143+ return [ [ str , false , 0 , '[无乱码]' , '' ] , ]
144+ }
145+
146+ if ( ! ! REGEX_MESSY_CJK . test ( str )
147+ && ( strHasHiraKana ( str ) || strHasASCII ( str ) )
148+ && strOnlyJapanese ( str ) ) {
149+ results . push ( [ str , false , 99 , '[全日文1]' ] )
152150 }
153151 else if ( strOnlyJapanese ( str ) ) {
154- results . push ( [ str , false , 80 , '全日文02' , ' '] )
152+ results . push ( [ str , false , 80 , '[全日文2] ' ] )
155153 }
156- // log.showRed(str)
157- // log.show(Array.from(str).map(c => c.codePointAt(0).toString(16)).join(' '))
154+
158155 for ( const enc1 of fromEnc ) {
159156 for ( const enc2 of toEnc ) {
157+ // 忽略解码编码相同的情况
160158 if ( enc1 === enc2 ) { continue }
161159 try {
162160 const strBuffer = iconv . encode ( str , enc1 )
163161 let strDecoded = iconv . decode ( strBuffer , enc2 )
164162 const badDecoded = checkBadUnicode ( strDecoded )
165163 // const strCleaned = strDecoded.replaceAll(/[\ufffd\u0020]/ugi, '')
166- log . info ( enc1 , enc2 , strDecoded , badDecoded )
164+ log . debug ( enc1 , enc2 , strDecoded , badDecoded )
167165 // 如果含有乱码字符
168166 if ( badDecoded ?. length > 0 ) {
169167 for ( const item of badDecoded ) {
170168 results . push ( [ strDecoded , ...item , `${ enc1 } =>${ enc2 } ` ] )
171169 }
172-
173170 continue ;
174171 }
175- // log.showRed('========')
176- // log.showRed(str)
177- // log.showGreen(Array.from(str).map(c => c.codePointAt(0).toString(16)))
178- // log.show(strDecoded, enc1, enc2)
179-
180172 const onlyASCII = strOnlyASCII ( strDecoded )
181173 const onlyCN = strOnlyChinese ( strDecoded )
182174 const onlyJP = strOnlyJapanese ( strDecoded )
183175 const onlyJPHan = strOnlyJapaneseHan ( strDecoded )
176+ const onlyKR = strOnlyHangul ( strDecoded )
184177 const hasHiraKana = strHasHiraKana ( strDecoded )
185178 const hasHFHiraKana = strHasHFKanaHira ( strDecoded )
186179 const messyUnicode = REGEX_MESSY_UNICODE . test ( strDecoded )
187180 const messyCJK = REGEX_MESSY_CJK . test ( strDecoded )
188181 const messyCJKExt = REGEX_MESSY_CJK_EXT . test ( strDecoded )
189182
190- log . debug ( strDecoded , onlyASCII , onlyCN , onlyJP , onlyJPHan , messyCJK )
191- log . debug ( strDecoded , hasHiraKana , hasHFHiraKana , messyUnicode , messyCJK )
183+ log . debug ( strDecoded , 'cn' , onlyCN , 'jp' , onlyJP , 'jhan' , onlyJPHan , 'kr' , onlyKR )
184+ log . debug ( strDecoded , 'hk' , hasHiraKana , 'hf' , hasHFHiraKana , 'mu' , messyUnicode , 'mc' , messyCJK )
192185
193186 if ( onlyASCII && ! strDecoded . includes ( '?' ) ) {
194- results . push ( [ strDecoded , true , 99 , `全英文数字 ` , `${ enc1 } =>${ enc2 } ` ] )
187+ results . push ( [ strDecoded , true , 99 , `ASCII ` , `${ enc1 } =>${ enc2 } ` ] )
195188 break
196189 }
197190 if ( RE_CHARS_MOST_USED . test ( strDecoded ) ) {
198191 results . push ( [ strDecoded , true , 99 , `常用汉字` , `${ enc1 } =>${ enc2 } ` ] )
199192 break
200193 }
201- if ( messyCJK || messyCJKExt ) {
202- results . push ( [ strDecoded , true , 50 , `CJK罕见` , `${ enc1 } =>${ enc2 } ` ] )
203-
204- }
205194 if ( onlyJP ) {
206- if ( strHasHiraKana ( strDecoded ) || onlyJPHan ) {
195+ if ( ! strHasHiraKana ( strDecoded ) && ! onlyJPHan ) {
207196 results . push ( [ strDecoded , true , 78 , `日文字符` , `${ enc1 } =>${ enc2 } ` ] )
208197 }
209198 }
210199 else if ( onlyCN ) {
211200 results . push ( [ strDecoded , true , 76 , `中文字符` , `${ enc1 } =>${ enc2 } ` ] )
212201 }
213- else if ( strHasHFKanaHira || strHasHiraKana ) {
202+ else if ( hasHiraKana || hasHFHiraKana ) {
214203 results . push ( [ strDecoded , true , 65 , `含日文假名` , ` ${ enc1 } =>${ enc2 } ` ] )
215204 }
205+ else if ( onlyKR ) {
206+ results . push ( [ strDecoded , true , 62 , `韩文字符` , `${ enc1 } =>${ enc2 } ` ] )
207+ }
208+ else if ( messyCJK || messyCJKExt ) {
209+ results . push ( [ strDecoded , true , 51 , `生僻字` , `${ enc1 } =>${ enc2 } ` ] )
210+ // continue
211+ }
216212 else {
217213 results . push ( [ strDecoded , true , 60 , `正常转换 ${ onlyCN } ${ onlyJP } ` , ` ${ enc1 } =>${ enc2 } ` ] )
218214 }
@@ -222,7 +218,7 @@ export function fixCJKEncImpl(str,
222218 }
223219 }
224220 }
225- results . push ( [ str , false , 70 , '原始字符串 ' ] )
221+ results . push ( [ str , false , 70 , '原始值 ' ] )
226222 results = results . filter ( r => r [ 2 ] >= threhold ) . sort ( ( a , b ) => b [ 2 ] - a [ 2 ] )
227223 log . debug ( results . slice ( 3 ) )
228224 return results ;
0 commit comments