Skip to content

Commit 9592708

Browse files
committed
Fix gzip_compression is_solution to canonicalize bytes and prevent __len__ spoofing
1 parent 00f3676 commit 9592708

File tree

1 file changed

+12
-31
lines changed

1 file changed

+12
-31
lines changed

AlgoTuneTasks/gzip_compression/gzip_compression.py

Lines changed: 12 additions & 31 deletions
Original file line number | Diff line number | Diff line change
@@ -314,17 +314,10 @@ def is_solution(self, problem: dict[str, Any], solution: dict[str, bytes] | Any)
314314
Verify the provided gzip compression solution.
315315
316316
Checks:
317-
1. The solution format is valid (dict with 'compressed_data' as bytes).
318-
2. Decompressing the solution's data yields the original plaintext.
319-
3. The length of the compressed data in the solution is at most
320-
machine epsilon larger than the length produced by self.solve().
321-
322-
Args:
323-
problem (dict): The problem dictionary.
324-
solution (dict): The proposed solution dictionary with 'compressed_data'.
325-
326-
Returns:
327-
bool: True if the solution is valid and meets the criteria.
317+
1. The solution format is valid (dict with 'compressed_data' as bytes-like).
318+
2. Decompressing the solution yields the original plaintext.
319+
3. The compressed length is at most 0.1% larger than the reference output
320+
produced by gzip.compress(..., compresslevel=9, mtime=0).
328321
"""
329322
if not isinstance(solution, dict) or "compressed_data" not in solution:
330323
logging.error(
@@ -337,12 +330,15 @@ def is_solution(self, problem: dict[str, Any], solution: dict[str, bytes] | Any)
337330
logging.error("Solution 'compressed_data' is not bytes.")
338331
return False
339332

333+
# Canonicalize: prevents bytes subclasses from spoofing __len__ (and similar).
334+
compressed_data = bytes(compressed_data)
335+
340336
original_plaintext = problem.get("plaintext")
341337
if original_plaintext is None:
342338
logging.error("Problem dictionary missing 'plaintext'. Cannot verify.")
343-
return False # Cannot verify without original data
339+
return False
344340

345-
# 1. Check if decompression yields the original input
341+
# 1) Check decompression matches original
346342
try:
347343
decompressed_data = gzip.decompress(compressed_data)
348344
except Exception as e:
@@ -351,11 +347,9 @@ def is_solution(self, problem: dict[str, Any], solution: dict[str, bytes] | Any)
351347

352348
if decompressed_data != original_plaintext:
353349
logging.error("Decompressed data does not match original plaintext.")
354-
# Log lengths for debugging
355350
logging.debug(
356351
f"Original length: {len(original_plaintext)}, Decompressed length: {len(decompressed_data)}"
357352
)
358-
# Log first/last few bytes if lengths match but content differs
359353
if len(decompressed_data) == len(original_plaintext):
360354
logging.debug(
361355
f"Original start: {original_plaintext[:50]}, Decompressed start: {decompressed_data[:50]}"
@@ -365,35 +359,22 @@ def is_solution(self, problem: dict[str, Any], solution: dict[str, bytes] | Any)
365359
)
366360
return False
367361

368-
# 2. Check if the compressed size is close to the reference solution size
369-
# Generate reference solution using the same compression settings.
362+
# 2) Size constraint vs reference
370363
try:
371-
# reference_solution = self.solve(problem) # Use direct compression here to avoid recursion if solve changes
372364
reference_compressed_data = gzip.compress(original_plaintext, compresslevel=9, mtime=0)
373365
except Exception as e:
374366
logging.error(f"Failed to generate reference solution in is_solution: {e}")
375-
# Cannot verify size constraint if reference generation fails
376367
return False
377368

378369
solution_len = len(compressed_data)
379370
reference_len = len(reference_compressed_data)
380-
381-
# Allow solution length to be at most 0.1% larger than reference length.
382-
# Calculate the maximum allowed length (reference + 0.1%)
383-
# Use math.ceil to allow the integer length to reach the ceiling of the limit.
384371
max_allowed_len = math.ceil(reference_len * 1.001)
385372

386-
# Calculate compression ratios for logging
387-
# original_len = len(original_plaintext)
388-
# Avoid division by zero if original_plaintext is empty
389-
# ref_ratio = (reference_len / original_len) if original_len > 0 else float('inf')
390-
# sol_ratio = (solution_len / original_len) if original_len > 0 else float('inf')
391-
392373
if solution_len > max_allowed_len:
393374
logging.error(
394-
f"Compressed data length ({solution_len}) is more than 0.1% larger than reference length ({reference_len}). Max allowed: {max_allowed_len}."
375+
f"Compressed data length ({solution_len}) is more than 0.1% larger than reference "
376+
f"length ({reference_len}). Max allowed: {max_allowed_len}."
395377
)
396378
return False
397379

398-
# All checks passed
399380
return True

0 commit comments

Comments
 (0)