@@ -45,7 +45,7 @@ struct bucket {
};

/**
- * calc_shannon_entropy() - Compute Shannon entropy of the sampled data.
+ * has_low_entropy() - Compute Shannon entropy of the sampled data.
 * @bkt: Byte counts of the sample.
 * @slen: Size of the sample.
 *
@@ -60,7 +60,7 @@ struct bucket {
 * Also Shannon entropy is the last computed heuristic; if we got this far and ended up
 * with uncertainty, just stay on the safe side and call it incompressible.
 */
-static bool calc_shannon_entropy(struct bucket *bkt, size_t slen)
+static bool has_low_entropy(struct bucket *bkt, size_t slen)
{
        const size_t threshold = 65, max_entropy = 8 * ilog2(16);
        size_t i, p, p2, len, sum = 0;
@@ -79,17 +79,21 @@ static bool calc_shannon_entropy(struct bucket *bkt, size_t slen)
        return ((sum * 100 / max_entropy) <= threshold);
}
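For reference, the heuristic renamed here scores the sample's byte histogram against the classic Shannon entropy. The integer-math loop body is elided by this hunk, but the return statement above is the decision rule; in standard notation (a sketch, not copied from the patch):

\[
  H = -\sum_{i=0}^{255} p_i \log_2 p_i, \qquad p_i = \frac{\mathrm{bkt}[i].\mathrm{count}}{\mathrm{slen}}
\]

has_low_entropy() returns true (data still considered worth compressing) when the normalized score 100 * H / H_max stays at or below the 65% threshold; max_entropy is that H_max expressed in the same scaled integer units the elided loop accumulates sum in.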
+#define BYTE_DIST_BAD   0
+#define BYTE_DIST_GOOD  1
+#define BYTE_DIST_MAYBE 2
/**
 * calc_byte_distribution() - Compute byte distribution on the sampled data.
 * @bkt: Byte counts of the sample.
 * @slen: Size of the sample.
 *
 * Return:
- * 1: High probability (normal (Gaussian) distribution) of the data being compressible.
- * 0: A "hard no" for compression -- either a computed uniform distribution of the bytes (e.g.
- *    random or encrypted data), or calc_shannon_entropy() returned false (see above).
- * 2: When computed byte distribution resulted in "low > n < high" grounds.
- *    calc_shannon_entropy() should be used for a final decision.
+ * BYTE_DIST_BAD:   A "hard no" for compression -- a computed uniform distribution of
+ *                  the bytes (e.g. random or encrypted data).
+ * BYTE_DIST_GOOD:  High probability (normal (Gaussian) distribution) of the data being
+ *                  compressible.
+ * BYTE_DIST_MAYBE: When computed byte distribution resulted in "low > n < high"
+ *                  grounds. has_low_entropy() should be used for a final decision.
 */
static int calc_byte_distribution(struct bucket *bkt, size_t slen)
{
@@ -101,7 +105,7 @@ static int calc_byte_distribution(struct bucket *bkt, size_t slen)
                sum += bkt[i].count;

        if (sum > threshold)
-               return i;
+               return BYTE_DIST_BAD;

        for (; i < high && bkt[i].count > 0; i++) {
                sum += bkt[i].count;
@@ -110,36 +114,29 @@ static int calc_byte_distribution(struct bucket *bkt, size_t slen)
        }

        if (i <= low)
-               return 1;
+               return BYTE_DIST_GOOD;

        if (i >= high)
-               return 0;
+               return BYTE_DIST_BAD;

-       return 2;
+       return BYTE_DIST_MAYBE;
}
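A subtlety worth flagging: is_compressible() below collapses this tri-state into a bool with ret = !!i, which quietly depends on BYTE_DIST_BAD being the only zero value. A hypothetical compile-time guard (illustrative, not part of the patch) would pin that contract down:

        /* Hypothetical guard, not in the patch: "ret = !!i" in is_compressible()
         * is only correct while BYTE_DIST_BAD is the sole falsy value.
         */
        _Static_assert(BYTE_DIST_BAD == 0, "BYTE_DIST_BAD must stay 0");
        _Static_assert(BYTE_DIST_GOOD != 0 && BYTE_DIST_MAYBE != 0,
                       "non-BAD results must be non-zero");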
-static bool check_ascii_bytes(const struct bucket *bkt)
+static bool is_mostly_ascii(const struct bucket *bkt)
{
-       const size_t threshold = 64;
        size_t count = 0;
        int i;

-       for (i = 0; i < threshold; i++)
+       for (i = 0; i < 256; i++)
                if (bkt[i].count > 0)
-                       count++;
+                       /* Too many non-ASCII (0-63) bytes. */
+                       if (++count > 64)
+                               return false;

-       for (; i < 256; i++) {
-               if (bkt[i].count > 0) {
-                       count++;
-                       if (count > threshold)
-                               break;
-               }
-       }
-
-       return (count < threshold);
+       return true;
}
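The rewrite also changes the scan to exit early: it counts distinct byte values across the whole histogram and gives up as soon as more than 64 are seen, rather than always walking both ranges and comparing at the end. A self-contained userspace sketch of the same logic (illustrative, not the kernel code):

        /* Userspace sketch of the early-exit distinct-byte-value check. */
        #include <stdbool.h>
        #include <stddef.h>

        struct bucket { size_t count; };

        static bool is_mostly_ascii_sketch(const struct bucket bkt[256])
        {
                size_t count = 0;

                for (int i = 0; i < 256; i++)
                        if (bkt[i].count > 0 && ++count > 64)
                                return false; /* too many distinct byte values */

                return true;
        }

A histogram built from plain English text typically touches a few dozen distinct byte values at most, so the function returns true and is_compressible() can declare the sample compressible without running the costlier distribution and entropy checks.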
-static bool check_repeated_data(const u8 *sample, size_t len)
+static bool has_repeated_data(const u8 *sample, size_t len)
{
        size_t s = len / 2;
@@ -222,71 +219,79 @@ static int collect_sample(const struct iov_iter *iter, ssize_t max, u8 *sample)
 * is_compressible() - Determines if a chunk of data is compressible.
 * @data: Iterator containing uncompressed data.
 *
- * Return:
- * 0: @data is not compressible
- * 1: @data is compressible
- * -ENOMEM: failed to allocate memory for sample buffer
+ * Return: true if @data is compressible, false otherwise.
 *
 * Tests show that this function is quite reliable in predicting data compressibility,
 * matching close to 1:1 with the behaviour of LZ77 compression success and failures.
 */
-static int is_compressible(const struct iov_iter *data)
+static bool is_compressible(const struct iov_iter *data)
{
        const size_t read_size = SZ_2K, bkt_size = 256, max = SZ_4M;
        struct bucket *bkt = NULL;
-       int i = 0, ret = 0;
        size_t len;
        u8 *sample;
+       bool ret = false;
+       int i;

+       /* Preventive double check -- already checked in should_compress(). */
        len = iov_iter_count(data);
-       if (len < read_size)
-               return 0;
+       if (unlikely(len < read_size))
+               return ret;

        if (len - read_size > max)
                len = max;

        sample = kvzalloc(len, GFP_KERNEL);
-       if (!sample)
-               return -ENOMEM;
+       if (!sample) {
+               WARN_ON_ONCE(1);
+
+               return ret;
+       }

        /* Sample 2K bytes per page of the uncompressed data. */
-       ret = collect_sample(data, len, sample);
-       if (ret < 0)
+       i = collect_sample(data, len, sample);
+       if (i <= 0) {
+               WARN_ON_ONCE(1);
+
                goto out;
+       }

-       len = ret;
-       ret = 1;
+       len = i;
+       ret = true;

-       if (check_repeated_data(sample, len))
+       if (has_repeated_data(sample, len))
                goto out;

        bkt = kcalloc(bkt_size, sizeof(*bkt), GFP_KERNEL);
        if (!bkt) {
-               kvfree(sample);
-               return -ENOMEM;
+               WARN_ON_ONCE(1);
+               ret = false;
+
+               goto out;
        }

        for (i = 0; i < len; i++)
                bkt[sample[i]].count++;

-       if (check_ascii_bytes(bkt))
+       if (is_mostly_ascii(bkt))
                goto out;

        /* Sort in descending order */
        sort(bkt, bkt_size, sizeof(*bkt), cmp_bkt, NULL);

-       ret = calc_byte_distribution(bkt, len);
-       if (ret != 2)
+       i = calc_byte_distribution(bkt, len);
+       if (i != BYTE_DIST_MAYBE) {
+               ret = !!i;
+
                goto out;
+       }

-       ret = calc_shannon_entropy(bkt, len);
+       ret = has_low_entropy(bkt, len);
out:
        kvfree(sample);
        kfree(bkt);

-       WARN(ret < 0, "%s: ret=%d\n", __func__, ret);
-
-       return !!ret;
+       return ret;
}
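Taken together, the reworked is_compressible() is now a straight boolean pipeline: collect a sample, bail out early on repeated halves (has_repeated_data) or a text-like histogram (is_mostly_ascii), then ask calc_byte_distribution(), and only fall through to the entropy estimate (has_low_entropy) on a BYTE_DIST_MAYBE verdict. Allocation and sampling failures no longer leak -ENOMEM to the caller; they warn once and report "not compressible", the safe default for a heuristic.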
bool should_compress(const struct cifs_tcon *tcon, const struct smb_rqst *rq)
@@ -305,7 +310,7 @@ bool should_compress(const struct cifs_tcon *tcon, const struct smb_rqst *rq)
        if (shdr->Command == SMB2_WRITE) {
                const struct smb2_write_req *wreq = rq->rq_iov->iov_base;

-               if (wreq->Length < SMB_COMPRESS_MIN_LEN)
+               if (le32_to_cpu(wreq->Length) < SMB_COMPRESS_MIN_LEN)
                        return false;

                return is_compressible(&rq->rq_iter);
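The le32_to_cpu() change is the real bug-fix in this hunk: smb2_write_req.Length is a little-endian wire field (__le32), so comparing the raw value against the host-order SMB_COMPRESS_MIN_LEN was wrong on big-endian machines (and is the kind of mismatch sparse flags). A userspace sketch of what the conversion does (illustrative; the kernel helper already exists):

        /* Illustrative stand-in for the kernel's le32_to_cpu(); byte-swaps only
         * on big-endian hosts, where the raw wire value would compare wrong.
         */
        #include <stdint.h>

        static uint32_t sketch_le32_to_cpu(uint32_t le)
        {
        #if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
                return __builtin_bswap32(le);
        #else
                return le; /* little-endian host: wire order == host order */
        #endif
        }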