Skip to content

perf: Port FrozenOrderedSet to Rust #23200

Open
tobni wants to merge 2 commits into pantsbuild:main from
tobni:add/port-frozen-ordered-set
Open

perf: Port FrozenOrderedSet to rust#23200
tobni wants to merge 2 commits intopantsbuild:mainfrom
tobni:add/port-frozen-ordered-set

Conversation

@tobni
Copy link
Copy Markdown
Contributor

@tobni tobni commented Mar 29, 2026

Follow-up to #22501. Same approach — FrozenOrderedSet is now a pyo3 #[pyclass] backed by Py<PyDict>, with a lazily computed hash cached via OnceLock. The end goal is to port more rule code to Rust intrinsics.

"""Benchmark: Rust FrozenOrderedSet vs Python FrozenOrderedSet."""

import sys
import timeit
from collections.abc import Hashable, Iterable, Iterator
from typing import AbstractSet, Any, TypeVar

sys.path.insert(0, "src/python")

from pants.engine.internals.native_engine import FrozenOrderedSet as RustFrozenOrderedSet

T = TypeVar("T")


class PyFrozenOrderedSet(AbstractSet[T], Hashable):
    """Pure-Python FrozenOrderedSet as it looked before the port (benchmark baseline).

    Members are stored as the keys of a plain dict, which provides both
    uniqueness and insertion order. The hash is computed on first request
    and cached on the instance afterwards.
    """

    def __init__(self, iterable=None):
        # A falsy argument (None, or an empty container) yields an empty set.
        if iterable:
            self._items = dict.fromkeys(iterable)
        else:
            self._items = {}
        # Filled in lazily by __hash__.
        self._hash = None

    def __len__(self):
        """Return the number of distinct members."""
        return len(self._items)

    def __contains__(self, key):
        """Return whether `key` is a member (dict key lookup)."""
        return key in self._items

    def __iter__(self) -> Iterator:
        """Iterate over members in insertion order."""
        return iter(self._items)

    def __reversed__(self):
        """Iterate over members in reverse insertion order, via a tuple snapshot."""
        snapshot = tuple(self._items)
        return reversed(snapshot)

    def __eq__(self, other):
        """Order-sensitive equality; only instances of this class are comparable."""
        if not isinstance(other, self.__class__):
            return NotImplemented
        if len(self._items) != len(other._items):
            return False
        # Same length: walk both in lockstep and stop at the first mismatch.
        for mine, theirs in zip(self._items, other._items):
            if not mine == theirs:
                return False
        return True

    def __hash__(self):
        """Return the XOR of all member hashes, computed once and then cached."""
        if self._hash is None:
            combined = 0
            for member in self._items:
                combined ^= hash(member)
            self._hash = combined
        return self._hash

    def __repr__(self):
        return f"PyFrozenOrderedSet({list(self)!r})"

    def __bool__(self):
        """An instance is truthy exactly when it has at least one member."""
        return len(self._items) > 0

    def union(self, other):
        """Return a new set: own members first, then unseen members of `other`."""
        merged = list(self)
        merged.extend(x for x in other if x not in self._items)
        return self.__class__(merged)

    def intersection(self, other):
        """Return a new set of own members that also occur in `other`, in own order."""
        wanted = set(other)
        return self.__class__(member for member in self if member in wanted)

    def difference(self, other):
        """Return a new set of own members that do not occur in `other`, in own order."""
        excluded = set(other)
        return self.__class__(member for member in self if member not in excluded)

    def issubset(self, other):
        """Return whether every member is contained in `other` (which must support len())."""
        # A larger collection can never be a subset of a smaller one.
        if len(self) > len(other):
            return False
        for member in self:
            if member not in other:
                return False
        return True


# Number of untimed executions performed before each timed run.
WARMUP = 1000


def measure(stmt, number, globs):
    """Return the mean time of one execution of `stmt`, in microseconds.

    `stmt` is first executed WARMUP times without recording the result, then
    `number` more times under the clock. `globs` is the namespace in which
    `stmt` is evaluated.
    """
    timer = timeit.Timer(stmt, globals=globs)
    timer.timeit(number=WARMUP)  # warm-up pass; result intentionally discarded
    elapsed_seconds = timer.timeit(number=number)
    return elapsed_seconds / number * 1_000_000


# Each factory below receives the per-dataset context as keyword arguments and
# returns a two-element list: [globals for the Python run, globals for the Rust run].
# Context entries a factory does not need are absorbed by `**_`.


def _construction_globs(data, py, rs, **_):
    """Globals for timing the constructor: the class under test and its input."""
    return [
        {"Cls": PyFrozenOrderedSet, "data": data},
        {"Cls": RustFrozenOrderedSet, "data": data},
    ]


def _set_only_globs(py, rs, **_):
    """Globals for statements that only need the set itself."""
    return [{"fd": py}, {"fd": rs}]


def _lookup_hit_globs(py, rs, mid, **_):
    """Globals for a membership test using the context's `mid` value as key."""
    return [{"fd": py, "k": mid}, {"fd": rs, "k": mid}]


def _lookup_miss_globs(py, rs, **_):
    """Globals for a membership test using the fixed key "MISSING"."""
    return [{"fd": py, "k": "MISSING"}, {"fd": rs, "k": "MISSING"}]


def _equality_globs(py, rs, py2, rs2, **_):
    """Globals for comparing a set against the context's second instance."""
    return [{"fd": py, "fd2": py2}, {"fd": rs, "fd2": rs2}]


def _binary_op_globs(py, rs, py_other, rs_other, **_):
    """Globals for two-operand set operations (`fd` and `other`)."""
    return [{"fd": py, "other": py_other}, {"fd": rs, "other": rs_other}]


def _subset_globs(py, rs, py_small, rs_small, **_):
    """Globals for the subset check of the small set against the full set."""
    return [{"small": py_small, "fd": py}, {"small": rs_small, "fd": rs}]


def _dict_key_globs(py, rs, **_):
    """Globals for looking the set up as a key in a one-entry dict."""
    return [{"fd": py, "d": {py: 1}}, {"fd": rs, "d": {rs: 1}}]


# (display name, statement to time, globals factory)
BENCHMARKS = [
    ("Construction", "Cls(data)", _construction_globs),
    ("hash()", "hash(fd)", _set_only_globs),
    ("__contains__", "k in fd", _lookup_hit_globs),
    ("__contains__ miss", "k in fd", _lookup_miss_globs),
    ("__eq__", "fd == fd2", _equality_globs),
    ("iteration", "list(fd)", _set_only_globs),
    ("union", "fd.union(other)", _binary_op_globs),
    ("intersection", "fd.intersection(other)", _binary_op_globs),
    ("difference", "fd.difference(other)", _binary_op_globs),
    ("issubset", "small.issubset(fd)", _subset_globs),
    ("dict key", "d[fd]", _dict_key_globs),
]

# Input collections of increasing size; each benchmark is run once per dataset.
SMALL = list(range(5))
MEDIUM = list(range(20))
LARGE = list(range(200))

DATASETS = [("small (5)", SMALL), ("medium (20)", MEDIUM), ("large (200)", LARGE)]

# benchmark name -> dataset name -> (python µs per call, rust µs per call)
all_results: dict[str, dict[str, tuple[float, float]]] = {}

for ds_name, data in DATASETS:
    print(f"\n{'=' * 60}")
    print(f"  Dataset: {ds_name}")
    print(f"{'=' * 60}")

    py = PyFrozenOrderedSet(data)
    rs = RustFrozenOrderedSet(data)
    py2 = PyFrozenOrderedSet(data)
    rs2 = RustFrozenOrderedSet(data)
    half = data[:len(data) // 2]
    # Second operand for union/intersection/difference: the first half of the
    # data plus an equal number of values (1000, 1001, ...) absent from it.
    other_items = half + list(range(1000, 1000 + len(half)))
    py_other = PyFrozenOrderedSet(other_items)
    rs_other = RustFrozenOrderedSet(other_items)
    py_small = PyFrozenOrderedSet(data[:3])
    rs_small = RustFrozenOrderedSet(data[:3])
    # Warm up lazy hashes on BOTH implementations so that neither side pays a
    # first-time hash computation inside a timed statement.
    for obj in (py, py2, py_other, py_small, rs, rs2, rs_other, rs_small):
        hash(obj)
    # Fewer timed iterations for the largest dataset.
    n = 500_000 if len(data) <= 20 else 50_000
    # Key used by the "__contains__" hit benchmark: the middle element.
    mid = data[len(data) // 2]

    ctx = dict(data=data, py=py, rs=rs, py2=py2, rs2=rs2, mid=mid,
               py_other=py_other, rs_other=rs_other, py_small=py_small, rs_small=rs_small)

    for bench_name, stmt, make_globs in BENCHMARKS:
        # Each factory yields the globals for the Python run and for the Rust run.
        py_globs, rs_globs = make_globs(**ctx)
        py_us = measure(stmt, n, py_globs)
        rs_us = measure(stmt, n, rs_globs)
        print(f"  {bench_name:.<20s} Python {py_us:8.3f} µs  Rust {rs_us:8.3f} µs  ({py_us / rs_us:.1f}x)")
        all_results.setdefault(bench_name, {})[ds_name] = (py_us, rs_us)

# Final report: one row per benchmark, one column per dataset, each cell the
# Python-time / Rust-time ratio.
ds_names = [label for label, _ in DATASETS]
column_heads = [f" | {name:>12s}" for name in ds_names]
header = f"  {'Operation':<20s}" + "".join(column_heads)
sep = f"  {'-'*20}" + f"-+-{'-'*12}" * len(ds_names)

print("\n".join([
    f"\n{'=' * 60}",
    "  Summary (Python / Rust speedup)",
    f"{'=' * 60}",
    header,
    sep,
]))
for bench_name, _, _ in BENCHMARKS:
    cells = []
    for ds_name in ds_names:
        py_us, rs_us = all_results[bench_name][ds_name]
        ratio = py_us / rs_us
        cells.append(f" | {ratio:11.1f}x")
    row = f"  {bench_name:<20s}" + "".join(cells)
    print(row)
  Operation            |    small (5) |  medium (20) |  large (200)
  ---------------------+--------------+--------------+-------------
  Construction         |         1.2x |         1.1x |         1.0x
  hash()               |         2.2x |         2.2x |         2.2x
  __contains__         |         1.8x |         1.7x |         1.7x
  __contains__ miss    |         1.8x |         1.8x |         1.8x
  __eq__               |         3.6x |         2.0x |         1.4x
  iteration            |         1.5x |         1.4x |         1.1x
  union                |         4.1x |         3.0x |         2.5x
  intersection         |         2.7x |         1.7x |         1.2x
  difference           |         3.0x |         1.8x |         1.3x
  issubset             |         7.1x |         6.8x |         7.0x
  dict key             |         2.1x |         2.0x |         2.0x

@tobni tobni force-pushed the add/port-frozen-ordered-set branch from 42a564e to ca376c9 Compare March 29, 2026 13:25
@tobni tobni added category:internal CI, fixes for not-yet-released features, etc. release-notes:not-required [CI] PR doesn't require mention in release notes labels Mar 29, 2026
@tobni tobni force-pushed the add/port-frozen-ordered-set branch 6 times, most recently from 027bd5b to ee17c67 Compare March 29, 2026 17:59
@tobni tobni force-pushed the add/port-frozen-ordered-set branch from ee17c67 to 0bf4cba Compare March 29, 2026 18:45
@cburroughs
Copy link
Copy Markdown
Contributor

Really wish I knew Rust better for all these cool performance cases. Cross-referencing: #14719

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

category:internal CI, fixes for not-yet-released features, etc. release-notes:not-required [CI] PR doesn't require mention in release notes

Projects

None yet

Development

Successfully merging this pull request may close these issues.

2 participants