Skip to content

Commit f2f56c9

Browse files
committed
Update: seeding
1 parent 3baaf04 commit f2f56c9

File tree

2 files changed

+41
-80
lines changed

2 files changed

+41
-80
lines changed

kaleidoscope/algorithms/randomize.py

Lines changed: 9 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -9,33 +9,12 @@
99

1010
import dask.array as da
1111
import numpy as np
12-
from numpy.random import SeedSequence
1312

1413
from ..generators import DefaultNormal
1514
from ..interface.algorithm import InformedBlockAlgorithm
1615
from ..interface.generating import Normal
1716

1817

19-
def _block_seed(
20-
block_id: tuple[int, ...], root_seed: np.ndarray
21-
) -> np.ndarray:
22-
"""Returns a random seed array for a given block."""
23-
work_seed = SeedSequence(_hash(block_id)).generate_state(1)
24-
return np.array([i for i in work_seed] + [i for i in root_seed])
25-
26-
27-
def _hash(block_id: tuple[int, ...]) -> int:
28-
"""
29-
Daniel J. Bernstein hash function.
30-
31-
Returns a positive hash value.
32-
"""
33-
h = 5381
34-
for i in block_id:
35-
h = ((h << 5) + h) + i
36-
return h
37-
38-
3918
def _chlorophyll(
4019
seed: np.ndarray, x: np.ndarray, u: np.ndarray
4120
) -> np.ndarray:
@@ -78,19 +57,19 @@ def __init__(
7857
dtype: np.dtype = np.single,
7958
m: int = 2,
8059
dist: Literal["normal", "lognormal", "chlorophyll"] | str = "normal",
81-
entropy: int | list[int] | None = None,
60+
seed: np.ndarray | None = None,
8261
):
8362
"""
8463
Creates a new algorithm instance.
8564
8665
:param dtype: The result data type.
8766
:param m: The number of input data dimensions.
8867
:param dist: The type of measurement error distribution.
89-
:param entropy: The entropy to create the seed sequence.
68+
:param seed: The root seed; each block seed is the block ID followed by it.
9069
"""
9170
super().__init__(dtype, m, m)
9271
self._dist = dist
93-
self._root_seed = SeedSequence(entropy).generate_state(8)
72+
self._root_seed = seed
9473

9574
def chunks(self, *inputs: da.Array) -> tuple[int, ...] | None:
9675
return None
@@ -120,7 +99,7 @@ def randomize(
12099
:param clip: Where to clip measurement errors.
121100
:return: The measurement values randomized.
122101
"""
123-
seed = _block_seed(kwargs["block_id"], self._root_seed)
102+
seed = self.block_seed(kwargs["block_id"])
124103

125104
x = data[0]
126105
u = (
@@ -143,10 +122,14 @@ def randomize(
143122
y = x
144123
if clip is not None:
145124
y = np.clip(y, a_min=clip[0], a_max=clip[1])
146-
return y
125+
return np.where(np.isfinite(y), y, x)
147126

148127
compute_block = randomize
149128

129+
def block_seed(self, block_id: tuple[int, ...]) -> np.ndarray:
130+
"""Returns the block seed: the block ID entries followed by the root seed."""
131+
return np.array([i for i in block_id] + [i for i in self._root_seed])
132+
150133
@property
151134
def name(self) -> str:
152135
return "randomize"

kaleidoscope/operators/randomizeop.py

Lines changed: 32 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -24,18 +24,6 @@
2424
from ..logger import get_logger
2525

2626

27-
def _hash(name: str) -> int:
28-
"""
29-
Daniel J. Bernstein hash function.
30-
31-
Returns a positive hash value.
32-
"""
33-
h = 5381
34-
for c in name:
35-
h = ((h << 5) + h) + ord(c)
36-
return h
37-
38-
3927
def _decode(x: da.Array, a: dict[str:Any]) -> da.Array:
4028
"""Returns decoded data."""
4129
f = Decode(np.single if x.dtype == np.single else np.double, x.ndim)
@@ -101,7 +89,7 @@ def run(self, source: Dataset) -> Dataset: # noqa: D102
10189
if v not in config or self._args.selector == 0:
10290
continue
10391
get_logger().info(f"starting graph for variable: {v}")
104-
self.randomize(source, target, v, x, config[v])
92+
self.randomize(target, v, x, config[v])
10593
get_logger().info(f"finished graph for variable: {v}")
10694
return target
10795

@@ -116,28 +104,8 @@ def config(self) -> dict[str : dict[str:Any]]:
116104
config = json.load(r)
117105
return config
118106

119-
# noinspection PyShadowingNames
120-
def entropy(self, name: str, uuid: str, n: int = 4) -> list[int]:
121-
"""
122-
Returns the entropy of the seed sequence used for a given variable.
123-
124-
Entropy is generated using the Philox bit generator, which produces
125-
truly independent sequences for different values of the seed.
126-
127-
:param name: The variable name.
128-
:param uuid: The dataset UUID.
129-
:param n: The length of the seed sequence.
130-
:return: The entropy.
131-
"""
132-
from numpy.random import Philox
133-
134-
seed = _hash(f"{name}-{uuid}") + self._args.selector
135-
g = DefaultGenerator(Philox(seed))
136-
return [g.next() for _ in range(n)]
137-
138107
def randomize(
139108
self,
140-
source: Dataset,
141109
target: Dataset,
142110
v: str,
143111
x: DataArray,
@@ -146,22 +114,14 @@ def randomize(
146114
"""
147115
Creates the graph to randomize a variable.
148116
149-
:param source: The source dataset.
150117
:param target: The target dataset.
151118
:param v: The name of the variable.
152119
:param x: The data of the variable.
153120
:param config: The randomization configuration.
154121
"""
155-
if "total" in config:
156-
s: list[int] = []
157-
z = _decode(x.data, x.attrs)
158-
for ref in config["total"]:
159-
a = _decode(target[ref].data, target[ref].attrs)
160-
b = _decode(source[ref].data, source[ref].attrs)
161-
z = z + (a - b)
162-
elif "uncertainty" in config:
163-
s: list[int] = self.entropy(v, self.uuid)
164-
f = Randomize(m=x.ndim, dist=config["distribution"], entropy=s)
122+
if "uncertainty" in config:
123+
s = self.seed(self.uuid(v))
124+
f = Randomize(m=x.ndim, dist=config["distribution"], seed=s)
165125
u = (
166126
target[config["uncertainty"]]
167127
if isinstance(config["uncertainty"], str)
@@ -182,8 +142,8 @@ def randomize(
182142
clip=config.get("clip", None),
183143
)
184144
else:
185-
s: list[int] = self.entropy(v, self.uuid)
186-
f = Randomize(m=x.ndim, dist=config["distribution"], entropy=s)
145+
s = self.seed(self.uuid(v))
146+
f = Randomize(m=x.ndim, dist=config["distribution"], seed=s)
187147
b = target[config["bias"]]
188148
r = target[config["rmsd"]]
189149
z = f.apply_to(
@@ -206,20 +166,38 @@ def randomize(
206166
],
207167
dtype=z.dtype,
208168
)
209-
if s:
210-
target[v].attrs["entropy"] = np.array(s, dtype=np.int64)
169+
target[v].attrs["seed"] = s
211170
if get_logger().is_enabled(Logging.DEBUG):
212-
get_logger().debug(f"entropy: {s}")
171+
get_logger().debug(f"seed: {s}")
213172
get_logger().debug(f"min: {da.nanmin(z).compute() :.3f}")
214173
get_logger().debug(f"max: {da.nanmax(z).compute() :.3f}")
215174
get_logger().debug(f"mean: {da.nanmean(z).compute() :.3f}")
216175
get_logger().debug(f"std: {da.nanstd(z).compute() :.3f}")
217176

218-
@property
219-
def uuid(self) -> str:
177+
# noinspection PyShadowingNames
178+
def seed(self, uuid: uuid.UUID, n: int = 4) -> np.ndarray:
179+
"""
180+
Returns the seed sequence used for a given variable.
181+
182+
The seed sequence is generated using the Philox bit generator,
183+
which produces truly independent sequences of random numbers for
184+
different values of the seed.
185+
186+
:param uuid: The variable and dataset UUID.
187+
:param n: The length of the seed sequence.
188+
:return: The seed sequence.
189+
"""
190+
from numpy.random import Philox
191+
192+
seed = uuid.int + self._args.selector
193+
g = DefaultGenerator(Philox(seed))
194+
return np.array([g.next() for _ in range(n)], dtype=np.int64)
195+
196+
def uuid(self, v: str) -> uuid.UUID:
220197
"""
221-
Returns a UUID constructed from the basename of the source file.
198+
Returns a UUID constructed from the variable name and the
199+
stem of the source file path.
222200
"""
223-
return (
224-
f"{uuid.uuid5(uuid.NAMESPACE_URL, self._args.source_file.stem)}"
201+
return uuid.uuid5(
202+
uuid.NAMESPACE_URL, f"{v}-{self._args.source_file.stem}"
225203
)

0 commit comments

Comments
 (0)