@@ -43,7 +43,10 @@ def sliding_min_max(
4343 min_periods : int ,
4444 is_max : bool ,
4545) -> tuple [np .ndarray , list [int ]]:
46- # numba-only init part
46+ # Basic idea of the algorithm: https://stackoverflow.com/a/12239580
47+ # It was generalized to work with an arbitrary list of any window size and position
48+ # by adding the Dominators stack.
49+
4750 N = len (start )
4851 na_pos = []
4952 output = np .empty (N , dtype = result_dtype )
@@ -54,59 +57,24 @@ def cmp(a: Any, b: Any, is_max: bool) -> bool:
5457 else :
5558 return a <= b
5659
57- # All comments below are for the case of a maximum, that is, is_max = True.
58- # I will call this Q a "stash": preliminary calculations minimally necessary to
59- # finish the job. Q will always be in ascending order regardless of min/max:
60- # these are indices into the "values" array. The values[Q[i]], however, will
61- # be in non-descending order for max, and non-ascending for min.
62- # Think of this deque as indices of maximums found so far for varying window
63- # positions. That is, there is only one maximum in the source array, but it may
64- # not fit each window. So there are many "secondary maximums", and each of them
65- # may potentially fit multiple windows (unless, of course you get a very special
66- # case of an arary of strictly descending values and constant rolling window size).
67- # In that case Q will be the longest, so here is an idea for the worst case
68- # scenario testing.
69-
70- # We have to pretend, given that Numba has neither queue nor stack.
7160 Q : list = [] # this is a queue
7261 Dominators : list = [] # this is a stack
73- # END-OF numba-only init part
74-
75- # Basic idea of the algorithm: https://stackoverflow.com/a/12239580
76- # It was generalized to work with an arbitrary list of any window size and position
7762
78- # Zero is apparently passed here as a default.
79- # It is important to have this value precise for calculations.
8063 if min_periods < 1 :
8164 min_periods = 1
8265
83- # We will say that node j "dominates" node i if j comes after i, yet requires a
84- # deeper deque Q at the time node i is processed in order to be able to finish
85- # the job for node j. This precisely means the following two conditions:
86- # - j > i
87- # - start[j] < start[i].
88- # We keep track of such nodes in the Dominators queue.
89- # In addition, if it so happens that two nodes j1 and j2 dominate another node,
90- # and j2 > j1, yet start[j2] <= start[j1], then we only need to keep track of j2.
91- # (To understand why this works, note that the algorithm requires that
92- # the "end" array is sorted in non-descending order, among other things.)
9366 if N > 2 :
9467 i_next = N - 1 # equivalent to i_next = i+1 inside the loop
9568 for i in range (N - 2 , - 1 , - 1 ):
9669 next_dominates = start [i_next ] < start [i ]
9770 if next_dominates and (
9871 not Dominators or start [Dominators [- 1 ]] > start [i_next ]
9972 ):
100- # Both ">" and ">=" would have been (logically) equivalent, but we are
101- # shooting for the shortest size of the Dominators list, hence the
102- # usage of ">"
10373 Dominators .append (i_next )
10474 i_next = i
10575
10676 valid_start = - min_periods
10777
108- # Having these relieves us from having "if i>0" on each iteration for special
109- # handling
11078 last_end = 0
11179 last_start = - 1
11280
@@ -117,26 +85,16 @@ def cmp(a: Any, b: Any, is_max: bool) -> bool:
11785 if Dominators and Dominators [- 1 ] == i :
11886 Dominators .pop ()
11987
120- # TODO: Arguably there are benefits to having this consistency check before
121- # this function is even called (e.g. in rolling.py).
122- # Given the current implementation, it will be rather tricky at the moment
123- # to have this check in rolling.py. Additionally, this is only required for
124- # min/max, and may only ever be violated if user-defined window indexer is
125- # used. Thus this is the best spot for it, given the circumstances.
12688 if not (
12789 this_end > last_end or (this_end == last_end and this_start >= last_start )
12890 ):
12991 raise ValueError (
13092 "Start/End ordering requirement is violated at index " + str (i )
13193 )
13294
133- # This is the least restrictive starting index that will take care of current
134- # item (i) and all remaining items
13595 stash_start = (
13696 this_start if not Dominators else min (this_start , start [Dominators [- 1 ]])
13797 )
138- # Discard entries outside of the "needed" window. Do it first as to keep the
139- # stash small.
14098 while Q and Q [0 ] < stash_start :
14199 Q .pop (0 )
142100
@@ -150,29 +108,13 @@ def cmp(a: Any, b: Any, is_max: bool) -> bool:
150108 Q .append (k ) # Q.push_back(k)
151109
152110 if not Q or (this_start > valid_start ):
153- # The "not Q" condition means we have not seen anything but NaNs yet in
154- # values[:this_end-1]. The "this_start > valid_start" condition means we
155- # have not accumulated enough (min_periods or more) non-NaN values.
156111 if values .dtype .kind != "i" :
157112 output [i ] = np .nan
158113 else :
159114 na_pos .append (i )
160115 elif Q [0 ] >= this_start :
161- # This is the only read-from-the-stash scenario that ever happens when
162- # window size is constant across the set. This is also likely 99+% of
163- # all use cases, thus we want to make sure we do not go into bisection
164- # as to incur neither the *log(k) penalty nor the function call penalty
165- # for this very common case. If we are here, then our stash is as "deep"
166- # as what the current node ("job") requires. Thus take the front item.
167116 output [i ] = values [Q [0 ]]
168117 else :
169- # In this case our stash is bigger than what is necessary to compute this
170- # node's output due to a wider search window at (one of) the nodes that
171- # follow. We have to locate our value in the middle of the stash.
172- # Since our stash is sorted, we can use binary search:
173- # here we need to output the item closest to the front (idx=0) of the
174- # stash that fits our window bounds. Item 0 has been looked at (and
175- # discarded) by now, so lo=1
176118 q_idx = bisect_left (Q , this_start , lo = 1 )
177119 output [i ] = values [Q [q_idx ]]
178120 last_end = this_end
0 commit comments