Skip to content

Commit 5ac1f99

Browse files
ematsumiya authored and smfrench committed
smb: client: fix compression heuristic functions
Change is_compressible() return type to bool; use WARN_ON_ONCE(1) for internal errors and return false for those.

Renames:
- check_repeated_data -> has_repeated_data
- check_ascii_bytes -> is_mostly_ascii (also refactored into a single loop)
- calc_shannon_entropy -> has_low_entropy

Also wrap "wreq->Length" in le32_to_cpu() in should_compress() (caught by sparse).

Signed-off-by: Enzo Matsumiya <[email protected]>
Suggested-by: Dan Carpenter <[email protected]>
Signed-off-by: Steve French <[email protected]>
1 parent 3740884 commit 5ac1f99

File tree

1 file changed

+55
-50
lines changed

1 file changed

+55
-50
lines changed

fs/smb/client/compress.c

Lines changed: 55 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ struct bucket {
4545
};
4646

4747
/**
48-
* calc_shannon_entropy() - Compute Shannon entropy of the sampled data.
48+
* has_low_entropy() - Compute Shannon entropy of the sampled data.
4949
* @bkt: Bytes counts of the sample.
5050
* @slen: Size of the sample.
5151
*
@@ -60,7 +60,7 @@ struct bucket {
6060
* Also Shannon entropy is the last computed heuristic; if we got this far and ended up
6161
* with uncertainty, just stay on the safe side and call it uncompressible.
6262
*/
63-
static bool calc_shannon_entropy(struct bucket *bkt, size_t slen)
63+
static bool has_low_entropy(struct bucket *bkt, size_t slen)
6464
{
6565
const size_t threshold = 65, max_entropy = 8 * ilog2(16);
6666
size_t i, p, p2, len, sum = 0;
@@ -79,17 +79,21 @@ static bool calc_shannon_entropy(struct bucket *bkt, size_t slen)
7979
return ((sum * 100 / max_entropy) <= threshold);
8080
}
8181

82+
#define BYTE_DIST_BAD 0
83+
#define BYTE_DIST_GOOD 1
84+
#define BYTE_DIST_MAYBE 2
8285
/**
8386
* calc_byte_distribution() - Compute byte distribution on the sampled data.
8487
* @bkt: Byte counts of the sample.
8588
* @slen: Size of the sample.
8689
*
8790
* Return:
88-
* 1: High probability (normal (Gaussian) distribution) of the data being compressible.
89-
* 0: A "hard no" for compression -- either a computed uniform distribution of the bytes (e.g.
90-
* random or encrypted data), or calc_shannon_entropy() returned false (see above).
91-
* 2: When computed byte distribution resulted in "low > n < high" grounds.
92-
* calc_shannon_entropy() should be used for a final decision.
91+
* BYTE_DIST_BAD: A "hard no" for compression -- a computed uniform distribution of
92+
* the bytes (e.g. random or encrypted data).
93+
* BYTE_DIST_GOOD: High probability (normal (Gaussian) distribution) of the data being
94+
* compressible.
95+
* BYTE_DIST_MAYBE: When computed byte distribution resulted in "low > n < high"
96+
* grounds. has_low_entropy() should be used for a final decision.
9397
*/
9498
static int calc_byte_distribution(struct bucket *bkt, size_t slen)
9599
{
@@ -101,7 +105,7 @@ static int calc_byte_distribution(struct bucket *bkt, size_t slen)
101105
sum += bkt[i].count;
102106

103107
if (sum > threshold)
104-
return i;
108+
return BYTE_DIST_BAD;
105109

106110
for (; i < high && bkt[i].count > 0; i++) {
107111
sum += bkt[i].count;
@@ -110,36 +114,29 @@ static int calc_byte_distribution(struct bucket *bkt, size_t slen)
110114
}
111115

112116
if (i <= low)
113-
return 1;
117+
return BYTE_DIST_GOOD;
114118

115119
if (i >= high)
116-
return 0;
120+
return BYTE_DIST_BAD;
117121

118-
return 2;
122+
return BYTE_DIST_MAYBE;
119123
}
120124

121-
static bool check_ascii_bytes(const struct bucket *bkt)
125+
static bool is_mostly_ascii(const struct bucket *bkt)
122126
{
123-
const size_t threshold = 64;
124127
size_t count = 0;
125128
int i;
126129

127-
for (i = 0; i < threshold; i++)
130+
for (i = 0; i < 256; i++)
128131
if (bkt[i].count > 0)
129-
count++;
132+
/* Too many non-ASCII (0-63) bytes. */
133+
if (++count > 64)
134+
return false;
130135

131-
for (; i < 256; i++) {
132-
if (bkt[i].count > 0) {
133-
count++;
134-
if (count > threshold)
135-
break;
136-
}
137-
}
138-
139-
return (count < threshold);
136+
return true;
140137
}
141138

142-
static bool check_repeated_data(const u8 *sample, size_t len)
139+
static bool has_repeated_data(const u8 *sample, size_t len)
143140
{
144141
size_t s = len / 2;
145142

@@ -222,71 +219,79 @@ static int collect_sample(const struct iov_iter *iter, ssize_t max, u8 *sample)
222219
* is_compressible() - Determines if a chunk of data is compressible.
223220
* @data: Iterator containing uncompressed data.
224221
*
225-
* Return:
226-
* 0: @data is not compressible
227-
* 1: @data is compressible
228-
* -ENOMEM: failed to allocate memory for sample buffer
222+
* Return: true if @data is compressible, false otherwise.
229223
*
230224
* Tests shows that this function is quite reliable in predicting data compressibility,
231225
* matching close to 1:1 with the behaviour of LZ77 compression success and failures.
232226
*/
233-
static int is_compressible(const struct iov_iter *data)
227+
static bool is_compressible(const struct iov_iter *data)
234228
{
235229
const size_t read_size = SZ_2K, bkt_size = 256, max = SZ_4M;
236230
struct bucket *bkt = NULL;
237-
int i = 0, ret = 0;
238231
size_t len;
239232
u8 *sample;
233+
bool ret = false;
234+
int i;
240235

236+
/* Preventive double check -- already checked in should_compress(). */
241237
len = iov_iter_count(data);
242-
if (len < read_size)
243-
return 0;
238+
if (unlikely(len < read_size))
239+
return ret;
244240

245241
if (len - read_size > max)
246242
len = max;
247243

248244
sample = kvzalloc(len, GFP_KERNEL);
249-
if (!sample)
250-
return -ENOMEM;
245+
if (!sample) {
246+
WARN_ON_ONCE(1);
247+
248+
return ret;
249+
}
251250

252251
/* Sample 2K bytes per page of the uncompressed data. */
253-
ret = collect_sample(data, len, sample);
254-
if (ret < 0)
252+
i = collect_sample(data, len, sample);
253+
if (i <= 0) {
254+
WARN_ON_ONCE(1);
255+
255256
goto out;
257+
}
256258

257-
len = ret;
258-
ret = 1;
259+
len = i;
260+
ret = true;
259261

260-
if (check_repeated_data(sample, len))
262+
if (has_repeated_data(sample, len))
261263
goto out;
262264

263265
bkt = kcalloc(bkt_size, sizeof(*bkt), GFP_KERNEL);
264266
if (!bkt) {
265-
kvfree(sample);
266-
return -ENOMEM;
267+
WARN_ON_ONCE(1);
268+
ret = false;
269+
270+
goto out;
267271
}
268272

269273
for (i = 0; i < len; i++)
270274
bkt[sample[i]].count++;
271275

272-
if (check_ascii_bytes(bkt))
276+
if (is_mostly_ascii(bkt))
273277
goto out;
274278

275279
/* Sort in descending order */
276280
sort(bkt, bkt_size, sizeof(*bkt), cmp_bkt, NULL);
277281

278-
ret = calc_byte_distribution(bkt, len);
279-
if (ret != 2)
282+
i = calc_byte_distribution(bkt, len);
283+
if (i != BYTE_DIST_MAYBE) {
284+
ret = !!i;
285+
280286
goto out;
287+
}
281288

282-
ret = calc_shannon_entropy(bkt, len);
289+
ret = has_low_entropy(bkt, len);
283290
out:
284291
kvfree(sample);
285292
kfree(bkt);
286293

287-
WARN(ret < 0, "%s: ret=%d\n", __func__, ret);
288-
289-
return !!ret;
294+
return ret;
290295
}
291296

292297
bool should_compress(const struct cifs_tcon *tcon, const struct smb_rqst *rq)
@@ -305,7 +310,7 @@ bool should_compress(const struct cifs_tcon *tcon, const struct smb_rqst *rq)
305310
if (shdr->Command == SMB2_WRITE) {
306311
const struct smb2_write_req *wreq = rq->rq_iov->iov_base;
307312

308-
if (wreq->Length < SMB_COMPRESS_MIN_LEN)
313+
if (le32_to_cpu(wreq->Length) < SMB_COMPRESS_MIN_LEN)
309314
return false;
310315

311316
return is_compressible(&rq->rq_iter);

0 commit comments

Comments
 (0)