@@ -23,6 +23,7 @@ export type QualitySignals = {
2323 bulletCount : number
2424 templateMarkerHits : number
2525 genericSummary : boolean
26+ cjkChars : number
2627 structuralFingerprint : string
2728}
2829
@@ -40,6 +41,29 @@ function stripFrontmatter(raw: string) {
4041}
4142
/**
 * Splits `text` into lowercase word tokens.
 *
 * Preferred path: when the runtime exposes `Intl.Segmenter`, the text is
 * segmented at word granularity and only segments flagged `isWordLike` are
 * kept (trimmed and lowercased; empty results are discarded).
 *
 * Fallback path: used when `Intl.Segmenter` is missing or produced no tokens.
 * Matches ASCII runs that start with a letter or digit and continue with
 * letters, digits, apostrophes, or hyphens, and drops matches shorter than
 * two characters.
 *
 * @param text - Raw text to tokenize.
 * @returns The lowercase tokens in order of appearance.
 */
function tokenizeWords(text: string) {
  type WordSegment = { segment: string; isWordLike?: boolean }
  type WordSegmenterCtor = new (
    locale?: string | string[],
    options?: { granularity?: 'grapheme' | 'word' | 'sentence' },
  ) => { segment: (input: string) => Iterable<WordSegment> }

  // Segmenter is declared optional here so the lookup is feature-detected at
  // runtime (presumably because the configured lib typings may not include it).
  const WordSegmenter = (Intl as typeof Intl & { Segmenter?: WordSegmenterCtor }).Segmenter

  if (WordSegmenter) {
    const wordSplitter = new WordSegmenter(undefined, { granularity: 'word' })
    const segmented = Array.from(wordSplitter.segment(text))
      .filter((part) => part.isWordLike === true)
      .map((part) => part.segment.trim().toLowerCase())
      .filter((word) => word !== '')
    // An empty result falls through to the ASCII matcher below.
    if (segmented.length > 0) return segmented
  }

  const asciiWords = text.toLowerCase().match(/[a-z0-9][a-z0-9'-]*/g) ?? []
  return asciiWords.filter((word) => word.length > 1)
}
4569
@@ -95,6 +119,7 @@ export function computeQualitySignals(args: {
95119 const templateMarkerHits = TEMPLATE_MARKERS . filter ( ( marker ) => bodyLower . includes ( marker ) ) . length
96120 const summary = ( args . summary ?? '' ) . trim ( ) . toLowerCase ( )
97121 const genericSummary = / ^ e x p e r t g u i d a n c e f o r [ a - z 0 - 9 - ] + \. ? $ / . test ( summary )
122+ const cjkChars = ( body . match ( / [ \p{ Script= Han} \p{ Script= Hiragana} \p{ Script= Katakana} \p{ Script= Hangul} ] / gu) ?? [ ] ) . length
98123
99124 return {
100125 bodyChars,
@@ -104,6 +129,7 @@ export function computeQualitySignals(args: {
104129 bulletCount,
105130 templateMarkerHits,
106131 genericSummary,
132+ cjkChars,
107133 structuralFingerprint : toStructuralFingerprint ( args . readmeText ) ,
108134 }
109135}
@@ -127,8 +153,14 @@ export function evaluateQuality(args: {
127153} ) : QualityAssessment {
128154 const { signals, trustTier, similarRecentCount } = args
129155 const score = scoreQuality ( signals )
130- const rejectWordsThreshold = trustTier === 'low' ? 45 : trustTier === 'medium' ? 35 : 28
131- const rejectCharsThreshold = trustTier === 'low' ? 260 : trustTier === 'medium' ? 180 : 140
156+ const cjkHeavy =
157+ signals . cjkChars >= 40 || ( signals . bodyChars > 0 && signals . cjkChars / signals . bodyChars >= 0.15 )
158+ let rejectWordsThreshold = trustTier === 'low' ? 45 : trustTier === 'medium' ? 35 : 28
159+ let rejectCharsThreshold = trustTier === 'low' ? 260 : trustTier === 'medium' ? 180 : 140
160+ if ( cjkHeavy ) {
161+ rejectWordsThreshold = Math . max ( 24 , rejectWordsThreshold - 16 )
162+ rejectCharsThreshold = Math . max ( 140 , rejectCharsThreshold - 120 )
163+ }
132164 const quarantineScoreThreshold = trustTier === 'low' ? 72 : trustTier === 'medium' ? 60 : 50
133165 const similarityRejectThreshold = trustTier === 'low' ? 5 : trustTier === 'medium' ? 8 : 12
134166
@@ -157,6 +189,7 @@ export function evaluateQuality(args: {
157189 bulletCount : signals . bulletCount ,
158190 templateMarkerHits : signals . templateMarkerHits ,
159191 genericSummary : signals . genericSummary ,
192+ cjkChars : signals . cjkChars ,
160193 } ,
161194 }
162195 }
@@ -176,6 +209,7 @@ export function evaluateQuality(args: {
176209 bulletCount : signals . bulletCount ,
177210 templateMarkerHits : signals . templateMarkerHits ,
178211 genericSummary : signals . genericSummary ,
212+ cjkChars : signals . cjkChars ,
179213 } ,
180214 }
181215 }
@@ -194,6 +228,7 @@ export function evaluateQuality(args: {
194228 bulletCount : signals . bulletCount ,
195229 templateMarkerHits : signals . templateMarkerHits ,
196230 genericSummary : signals . genericSummary ,
231+ cjkChars : signals . cjkChars ,
197232 } ,
198233 }
199234}
0 commit comments