@@ -4,7 +4,7 @@ import iconv from 'iconv-lite';
44import os from 'os' ;
55import path from 'path' ;
66import * as log from './debug.js' ;
7- import { strHasASCII , strHasHFKanaHira , strHasHiraKana , strOnlyASCII , strOnlyChinese , strOnlyJapanese } from './unicode.js' ;
7+ import { strHasASCII , strHasHFKanaHira , strHasHiraKana , strOnlyASCII , strOnlyChinese , strOnlyJapanese , strOnlyJapaneseHan } from './unicode.js' ;
88import { CHINESE_CHARS_3500 , MESSY_CJK_CHARS as MESSY_CJK_CHARS_ } from './unicode_data.js' ;
99
1010// https://github.com/bnoordhuis/node-iconv/
@@ -32,7 +32,7 @@ export const MESSY_CJK_CHARS = MESSY_CJK_CHARS_
3232
3333export const REGEX_MESSY_CJK = new RegExp ( `[${ MESSY_CJK_CHARS } ]` , 'u' )
3434
35- export const REGEX_MESSY_CJK_EXT = / [ \u8701 - \u883f \u9200 - \u9484 ] / u //生僻字: 虫字旁 金字旁
35+ export const REGEX_MESSY_CJK_EXT = / [ \u8720 - \u883f \u9300 - \u9484 ] / u // rare characters: insect-radical (虫) and metal-radical (金) ideographs
3636
3737export const REGEX_MESSY_UNICODE = / [ \u007f - \u00a0 \u00c0 - \u017f \u0400 - \u1cff \u2070 - \u24ff \u0e00 - \u0e7f \u3400 - \u4dbf \uac00 - \uf8ff \ufe30 - \ufe4f \ufff0 - \uffff ] / u
3838
@@ -52,25 +52,34 @@ export function checkBadUnicode(str) {
5252 // 乱码标志 问号和黑问号
5353 results . push ( [ true , 0 , `非法字符` ] )
5454 }
55- if ( / [ \u00c0 - \u00d6 \u00d8 - \u024f \u3100 - \u312f \ua720 - \ua7ff \uab30 - \uabff ] / u. test ( str ) ) {
55+ if ( / [ \u00c0 - \u00d6 \u00d8 - \u024f \u3100 - \u312f ] / u. test ( str ) ) {
5656 // 乱码标志 拉丁字母扩展 注音符号
5757 results . push ( [ true , 2 , `拉丁字母扩展` ] )
5858 }
59- if ( / [ \u0530 - \u1cff ] / u. test ( str ) ) {
60- // 乱码标志 小众语言字母符号
61- results . push ( [ true , 3 , `小众语言符号 ` ] )
59+ if ( / [ \u3300 - \u33ff ] / u. test ( str ) ) {
60+ // 乱码标志 特殊字符
61+ results . push ( [ true , 4 , `CJK特殊字符 ` ] )
6262 }
63- if ( / [ \u3300 - \u3357 ] / u. test ( str ) ) {
64- // 乱码标志 方块片假名
65- results . push ( [ true , 4 , `方块片假名` ] )
63+ if ( / [ \u0370 - \u1cff ] / u. test ( str ) ) {
64+ // 乱码标志 小众语言符号
65+ results . push ( [ true , 3 , `小众语言A` ] )
66+ }
67+ if ( / [ \ua000 - \ua7ff \uab30 - \uabff \ud7b0 - \ud7ff ] / u. test ( str ) ) {
68+ // 乱码标志 小众语言符号
69+ results . push ( [ true , 4 , `小众语言B` ] )
70+ }
71+ if ( / [ \ud800 - \udfff ] / u. test ( str ) ) {
72+ // 乱码标志 代理对,存疑
73+ results . push ( [ true , 4 , `代理对` ] )
6674 }
6775 if ( / [ \ue000 - \uf8ff ] / u. test ( str ) ) {
6876 // 乱码标志 Unicode私有区
6977 results . push ( [ true , 5 , `私有区` ] )
7078 }
7179 if ( / [ \uff66 - \uff9d ] / u. test ( str ) ) {
80+ // 暂时忽略,还比较常用
7281 // 乱码标志 半角平假名片假名
73- results . push ( [ true , 6 , `半角假名` ] )
82+ // results.push([true, 6, `半角假名`])
7483 }
7584 if ( / [ 㼿 ] / u. test ( str ) ) {
7685 // 乱码标志 特殊生僻字
@@ -113,6 +122,8 @@ export function fixCJKEncImpl(str,
113122 // results.push([str, false, 0])
114123 return [ [ str , false , 0 , '全英文数字' , '' ] , ]
115124 }
125+ log . info ( '---------------------' )
126+ log . info ( 'fixCJKEnc' , str )
116127 if ( ! REGEX_MESSY_UNICODE . test ( str )
117128 && ! REGEX_MESSY_CJK . test ( str )
118129 && ! REGEX_MESSY_CJK_EXT . test ( str ) ) {
@@ -152,14 +163,15 @@ export function fixCJKEncImpl(str,
152163 let strDecoded = iconv . decode ( strBuffer , enc2 )
153164 const badDecoded = checkBadUnicode ( strDecoded )
154165 // const strCleaned = strDecoded.replaceAll(/[\ufffd\u0020]/ugi, '')
166+ log . info ( enc1 , enc2 , strDecoded , badDecoded )
155167 // 如果含有乱码字符
156168 if ( badDecoded ?. length > 0 ) {
157169 for ( const item of badDecoded ) {
158170 results . push ( [ strDecoded , ...item , `${ enc1 } =>${ enc2 } ` ] )
159171 }
172+
160173 continue ;
161174 }
162-
163175 // log.showRed('========')
164176 // log.showRed(str)
165177 // log.showGreen(Array.from(str).map(c => c.codePointAt(0).toString(16)))
@@ -168,8 +180,16 @@ export function fixCJKEncImpl(str,
168180 const onlyASCII = strOnlyASCII ( strDecoded )
169181 const onlyCN = strOnlyChinese ( strDecoded )
170182 const onlyJP = strOnlyJapanese ( strDecoded )
183+ const onlyJPHan = strOnlyJapaneseHan ( strDecoded )
184+ const hasHiraKana = strHasHiraKana ( strDecoded )
185+ const hasHFHiraKana = strHasHFKanaHira ( strDecoded )
171186 const messyUnicode = REGEX_MESSY_UNICODE . test ( strDecoded )
172187 const messyCJK = REGEX_MESSY_CJK . test ( strDecoded )
188+ const messyCJKExt = REGEX_MESSY_CJK_EXT . test ( strDecoded )
189+
190+ log . debug ( strDecoded , onlyASCII , onlyCN , onlyJP , onlyJPHan , messyCJK )
191+ log . debug ( strDecoded , hasHiraKana , hasHFHiraKana , messyUnicode , messyCJK )
192+
173193 if ( onlyASCII && ! strDecoded . includes ( '?' ) ) {
174194 results . push ( [ strDecoded , true , 99 , `全英文数字` , `${ enc1 } =>${ enc2 } ` ] )
175195 break
@@ -178,25 +198,21 @@ export function fixCJKEncImpl(str,
178198 results . push ( [ strDecoded , true , 99 , `常用汉字` , `${ enc1 } =>${ enc2 } ` ] )
179199 break
180200 }
181- log . debug ( strDecoded , onlyCN , onlyJP , messyUnicode , messyCJK )
201+ if ( messyCJK || messyCJKExt ) {
202+ results . push ( [ strDecoded , true , 50 , `CJK罕见` , `${ enc1 } =>${ enc2 } ` ] )
182203
183- if ( onlyJP && strHasHiraKana ( strDecoded ) ) {
184- results . push ( [ strDecoded , true , 78 , `日文字符` , `${ enc1 } =>${ enc2 } ` ] )
204+ }
205+ if ( onlyJP ) {
206+ if ( strHasHiraKana ( strDecoded ) || onlyJPHan ) {
207+ results . push ( [ strDecoded , true , 78 , `日文字符` , `${ enc1 } =>${ enc2 } ` ] )
208+ }
185209 }
186210 else if ( onlyCN ) {
187211 results . push ( [ strDecoded , true , 76 , `中文字符` , `${ enc1 } =>${ enc2 } ` ] )
188212 }
189- else if ( ! messyUnicode && ! messyCJK
190- && [ enc1 , enc2 ] . includes ( 'SHIFT_JIS' )
191- && [ enc1 , enc2 ] . includes ( 'UTF8' ) ) {
192- results . push ( [ strDecoded , true , 74 , `无特殊字符` , `${ enc1 } =>${ enc2 } ` ] )
213+ else if ( strHasHFKanaHira || strHasHiraKana ) {
214+ results . push ( [ strDecoded , true , 65 , `含日文假名` , ` ${ enc1 } =>${ enc2 } ` ] )
193215 }
194- // else if (messyCJK) {
195- // results.push([strDecoded, true, 51, `含特殊汉字`, `${enc1}=>${enc2}`])
196- // }
197- // else if (messyUnicode) {
198- // results.push([strDecoded, true, 52, `含特殊符号`, `${enc1}=>${enc2}`])
199- // }
200216 else {
201217 results . push ( [ strDecoded , true , 60 , `正常转换 ${ onlyCN } ${ onlyJP } ` , ` ${ enc1 } =>${ enc2 } ` ] )
202218 }
0 commit comments