Skip to content

Optimise (non?) iteration of empty dicts/collections ? #720

@stuaxo

Description

@stuaxo

I'm a big fan of collections defaulting to being empty and not None, and I a lot APIs default to this, whether you are returning a dict, or some sort of queryset from a database.

Then I can always

for k, v in collection.items():
    ...

It occurs to me, there must be a lot of empty dicts across python itself and I wonder if there is a performance hit, is it faster to do the loop below if we default to None, or if we default to empty.

if collection:
    for k, v in collection.items():
        ...

I've added a very basic benchmark, and it does seem quicker with the if when run only once, which is probably the case often in web services that are coming up, and in scripts as they start or commandline python tools.

import timeit
import statistics
from collections import defaultdict

def iterate_empty():
    """Iterate over an empty dict directly."""
    empty_dict = {}
    result = 0
    for key, value in empty_dict.items():
        result += value
    return result

def check_then_iterate():
    """Check if dict is empty before iterating."""
    empty_dict = {}
    result = 0
    if empty_dict:
        for key, value in empty_dict.items():
            result += value
    return result

def none_then_check_then_iterate():
    """Use None as default, check for None, then check if empty before iterating."""
    empty_dict = None
    result = 0
    if empty_dict is not None:
        if empty_dict:
            for key, value in empty_dict.items():
                result += value
    return result

def defaultdict_iterate():
    """Use defaultdict and iterate directly."""
    empty_dict = defaultdict(int)
    result = 0
    for key, value in empty_dict.items():
        result += value
    return result

def iterate_non_empty():
    """Baseline: Iterate over a dict with one item."""
    non_empty = {'a': 1}
    result = 0
    for key, value in non_empty.items():
        result += value
    return result

def check_then_iterate_non_empty():
    """Baseline: Check then iterate over a dict with one item."""
    non_empty = {'a': 1}
    result = 0
    if non_empty:
        for key, value in non_empty.items():
            result += value
    return result

# Number of iterations for each test
iterations = 1000000
number_of_runs = 7

# Run benchmarks
results = {}

print("Running benchmarks (lower is better)...")

for func_name, func in [
    ("iterate_empty", iterate_empty),
    ("check_then_iterate", check_then_iterate),
    ("none_then_check_then_iterate", none_then_check_then_iterate),
    ("defaultdict_iterate", defaultdict_iterate),
    ("iterate_non_empty", iterate_non_empty),
    ("check_then_iterate_non_empty", check_then_iterate_non_empty),
]:
    times = []
    for _ in range(number_of_runs):
        time_taken = timeit.timeit(func, number=iterations)
        times.append(time_taken)
    
    # Remove the highest and lowest times and calculate the average
    times.sort()
    trimmed_times = times[1:-1]
    avg_time = statistics.mean(trimmed_times)
    std_dev = statistics.stdev(trimmed_times) if len(trimmed_times) > 1 else 0
    
    results[func_name] = (avg_time, std_dev)
    print(f"{func_name}: {avg_time:.6f} seconds (±{std_dev:.6f})")

# Calculate relative performance
baseline = results["iterate_empty"]
print("\nRelative performance (compared to iterate_empty):")
for name, (time, std_dev) in results.items():
    relative = time / baseline[0]
    print(f"{name}: {relative:.2f}x ({time/iterations*1e9:.2f} ns per operation)")

# Now test with dictionaries of different sizes
print("\n===== Testing with different dictionary sizes =====")

def test_with_size(size):
    # Create dictionaries of specified size
    test_dict = {i: i for i in range(size)}
    
    def iterate_dict():
        result = 0
        for _, value in test_dict.items():
            result += value
        return result
    
    def check_then_iterate_dict():
        result = 0
        if test_dict:
            for _, value in test_dict.items():
                result += value
        return result
    
    iterate_time = timeit.timeit(iterate_dict, number=100000)
    check_time = timeit.timeit(check_then_iterate_dict, number=100000)
    
    print(f"Size {size}:")
    print(f"  Direct iteration: {iterate_time:.6f} seconds")
    print(f"  Check then iterate: {check_time:.6f} seconds")
    print(f"  Ratio (check/direct): {check_time/iterate_time:.2f}x")

# Test with different sizes
for size in [0, 1, 10, 100, 1000]:
    test_with_size(size)

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions