1
+ from __future__ import division
2
+
1
3
import numbers
4
+ from operator import mul
5
+ from functools import reduce
6
+
2
7
import numpy as np
3
8
9
+ MEGABYTE = 1024 * 1024
10
+
4
11
5
12
def is_array_sequence (obj ):
6
13
""" Return True if `obj` is an array sequence. """
@@ -16,6 +23,26 @@ def is_ndarray_of_int_or_bool(obj):
16
23
np .issubdtype (obj .dtype , np .bool )))
17
24
18
25
26
+ class _BuildCache (object ):
27
+ def __init__ (self , arr_seq , common_shape , dtype ):
28
+ self .offsets = list (arr_seq ._offsets )
29
+ self .lengths = list (arr_seq ._lengths )
30
+ self .next_offset = arr_seq ._get_next_offset ()
31
+ self .bytes_per_buf = arr_seq ._buffer_size * MEGABYTE
32
+ self .dtype = dtype
33
+ if arr_seq .common_shape != () and common_shape != arr_seq .common_shape :
34
+ raise ValueError (
35
+ "All dimensions, except the first one, must match exactly" )
36
+ self .common_shape = common_shape
37
+ n_in_row = reduce (mul , common_shape , 1 )
38
+ bytes_per_row = n_in_row * dtype .itemsize
39
+ self .rows_per_buf = bytes_per_row / self .bytes_per_buf
40
+
41
+ def update_seq (self , arr_seq ):
42
+ arr_seq ._offsets = np .array (self .offsets )
43
+ arr_seq ._lengths = np .array (self .lengths )
44
+
45
+
19
46
class ArraySequence (object ):
20
47
""" Sequence of ndarrays having variable first dimension sizes.
21
48
@@ -48,6 +75,8 @@ def __init__(self, iterable=None, buffer_size=4):
48
75
self ._data = np .array ([])
49
76
self ._offsets = np .array ([], dtype = np .intp )
50
77
self ._lengths = np .array ([], dtype = np .intp )
78
+ self ._buffer_size = buffer_size
79
+ self ._build_cache = None
51
80
52
81
if iterable is None :
53
82
return
@@ -60,25 +89,24 @@ def __init__(self, iterable=None, buffer_size=4):
60
89
self ._is_view = True
61
90
return
62
91
92
+ # If possible try pre-allocating memory.
63
93
try :
64
- # If possible try pre-allocating memory.
65
- if len (iterable ) > 0 :
66
- first_element = np .asarray (iterable [0 ])
67
- n_elements = np .sum ([len (iterable [i ])
68
- for i in range (len (iterable ))])
69
- new_shape = (n_elements ,) + first_element .shape [1 :]
70
- self ._data = np .empty (new_shape , dtype = first_element .dtype )
94
+ iter_len = len (iterable )
71
95
except TypeError :
72
96
pass
73
-
74
- # Initialize the `ArraySequence` object from iterable's item.
75
- coroutine = self ._extend_using_coroutine ()
76
- coroutine .send (None ) # Run until the first yield.
97
+ else : # We do know the iterable length
98
+ if iter_len == 0 :
99
+ return
100
+ first_element = np .asarray (iterable [0 ])
101
+ n_elements = np .sum ([len (iterable [i ])
102
+ for i in range (len (iterable ))])
103
+ new_shape = (n_elements ,) + first_element .shape [1 :]
104
+ self ._data = np .empty (new_shape , dtype = first_element .dtype )
77
105
78
106
for e in iterable :
79
- coroutine . send ( e )
107
+ self . append ( e , cache_build = True )
80
108
81
- coroutine . close () # Terminate coroutine.
109
+ self . finalize_append ()
82
110
83
111
@property
84
112
def is_array_sequence (self ):
@@ -92,21 +120,40 @@ def common_shape(self):
92
120
@property
93
121
def nb_elements (self ):
94
122
""" Total number of elements in this array sequence. """
95
- return self ._data . shape [ 0 ]
123
+ return np . sum ( self ._lengths )
96
124
97
125
@property
98
126
def data (self ):
99
127
""" Elements in this array sequence. """
100
128
return self ._data
101
129
102
- def append (self , element ):
130
+ def _get_next_offset (self ):
131
+ """ Offset in ``self._data`` at which to write next element """
132
+ if len (self ._offsets ) == 0 :
133
+ return 0
134
+ imax = np .argmax (self ._offsets )
135
+ return self ._offsets [imax ] + self ._lengths [imax ]
136
+
137
+ def append (self , element , cache_build = False ):
103
138
""" Appends `element` to this array sequence.
104
139
140
+ Append can be a lot faster if it knows that it is appending several
141
+ elements instead of a single element. In that case it can cache the
142
+ parameters it uses between append operations, in a "build cache". To
143
+ tell append to do this, use ``cache_build=True``. If you use
144
+ ``cache_build=True``, you need to finalize the append operations with
145
+ :method:`finalize_append`.
146
+
105
147
Parameters
106
148
----------
107
149
element : ndarray
108
150
Element to append. The shape must match already inserted elements
109
151
shape except for the first dimension.
152
+ cache_build : {False, True}
153
+ Whether to save the build cache from this append routine. If True,
154
+ append can assume it is the only player updating `self`, and the
155
+ caller must finalize `self` after all append operations, with
156
+ ``self.finalize_append()``.
110
157
111
158
Returns
112
159
-------
@@ -118,17 +165,56 @@ def append(self, element):
118
165
`ArraySequence.extend`.
119
166
"""
120
167
element = np .asarray (element )
168
+ if element .size == 0 :
169
+ return
170
+ el_shape = element .shape
171
+ n_items , common_shape = el_shape [0 ], el_shape [1 :]
172
+ build_cache = self ._build_cache
173
+ in_cached_build = build_cache is not None
174
+ if not in_cached_build : # One shot append, not part of sequence
175
+ build_cache = _BuildCache (self , common_shape , element .dtype )
176
+ next_offset = build_cache .next_offset
177
+ req_rows = next_offset + n_items
178
+ if self ._data .shape [0 ] < req_rows :
179
+ self ._resize_data_to (req_rows , build_cache )
180
+ self ._data [next_offset :req_rows ] = element
181
+ build_cache .offsets .append (next_offset )
182
+ build_cache .lengths .append (n_items )
183
+ build_cache .next_offset = req_rows
184
+ if in_cached_build :
185
+ return
186
+ if cache_build :
187
+ self ._build_cache = build_cache
188
+ else :
189
+ build_cache .update_seq (self )
121
190
122
- if self .common_shape != () and element .shape [1 :] != self .common_shape :
123
- msg = "All dimensions, except the first one, must match exactly"
124
- raise ValueError (msg )
191
+ def finalize_append (self ):
192
+ """ Finalize process of appending several elements to `self`
125
193
126
- next_offset = self ._data .shape [0 ]
127
- size = (self ._data .shape [0 ] + element .shape [0 ],) + element .shape [1 :]
128
- self ._data .resize (size )
129
- self ._data [next_offset :] = element
130
- self ._offsets = np .r_ [self ._offsets , next_offset ]
131
- self ._lengths = np .r_ [self ._lengths , element .shape [0 ]]
194
+ :method:`append` can be a lot faster if it knows that it is appending
195
+ several elements instead of a single element. To tell the append
196
+ method this is the case, use ``cache_build=True``. This method
197
+ finalizes the series of append operations after a call to
198
+ :method:`append` with ``cache_build=True``.
199
+ """
200
+ if self ._build_cache is None :
201
+ return
202
+ self ._build_cache .update_seq (self )
203
+ self ._build_cache = None
204
+
205
+ def _resize_data_to (self , n_rows , build_cache ):
206
+ """ Resize data array if required """
207
+ # Calculate new data shape, rounding up to nearest buffer size
208
+ n_bufs = np .ceil (n_rows / build_cache .rows_per_buf )
209
+ extended_n_rows = int (n_bufs * build_cache .rows_per_buf )
210
+ new_shape = (extended_n_rows ,) + build_cache .common_shape
211
+ if self ._data .size == 0 :
212
+ self ._data = np .empty (new_shape , dtype = build_cache .dtype )
213
+ else :
214
+ self ._data .resize (new_shape )
215
+
216
+ def shrink_data (self ):
217
+ self ._data .resize ((self ._get_next_offset (),) + self .common_shape )
132
218
133
219
def extend (self , elements ):
134
220
""" Appends all `elements` to this array sequence.
@@ -154,28 +240,16 @@ def extend(self, elements):
154
240
if not is_array_sequence (elements ):
155
241
self .extend (self .__class__ (elements ))
156
242
return
157
-
158
243
if len (elements ) == 0 :
159
244
return
160
-
161
- if (self .common_shape != () and
162
- elements .common_shape != self .common_shape ):
163
- msg = "All dimensions, except the first one, must match exactly"
164
- raise ValueError (msg )
165
-
166
- next_offset = self ._data .shape [0 ]
167
- self ._data .resize ((self ._data .shape [0 ] + sum (elements ._lengths ),
168
- elements ._data .shape [1 ]))
169
-
170
- offsets = []
171
- for offset , length in zip (elements ._offsets , elements ._lengths ):
172
- offsets .append (next_offset )
173
- chunk = elements ._data [offset :offset + length ]
174
- self ._data [next_offset :next_offset + length ] = chunk
175
- next_offset += length
176
-
177
- self ._lengths = np .r_ [self ._lengths , elements ._lengths ]
178
- self ._offsets = np .r_ [self ._offsets , offsets ]
245
+ self ._build_cache = _BuildCache (self ,
246
+ elements .common_shape ,
247
+ elements .data .dtype )
248
+ self ._resize_data_to (self ._get_next_offset () + elements .nb_elements ,
249
+ self ._build_cache )
250
+ for element in elements :
251
+ self .append (element )
252
+ self .finalize_append ()
179
253
180
254
def _extend_using_coroutine (self , buffer_size = 4 ):
181
255
""" Creates a coroutine allowing to append elements.
@@ -204,7 +278,7 @@ def _extend_using_coroutine(self, buffer_size=4):
204
278
offsets = []
205
279
lengths = []
206
280
207
- offset = 0 if len ( self ) == 0 else self . _offsets [ - 1 ] + self . _lengths [ - 1 ]
281
+ offset = self . _get_next_offset ()
208
282
try :
209
283
first_element = True
210
284
while True :
@@ -293,20 +367,24 @@ def __getitem__(self, idx):
293
367
start = self ._offsets [idx ]
294
368
return self ._data [start :start + self ._lengths [idx ]]
295
369
296
- elif isinstance (idx , (slice , list )) or is_ndarray_of_int_or_bool (idx ):
297
- seq = self .__class__ ()
370
+ seq = self .__class__ ()
371
+ seq ._is_view = True
372
+ if isinstance (idx , tuple ):
373
+ off_idx = idx [0 ]
374
+ seq ._data = self ._data .__getitem__ ((slice (None ),) + idx [1 :])
375
+ else :
376
+ off_idx = idx
298
377
seq ._data = self ._data
299
- seq ._offsets = self ._offsets [idx ]
300
- seq ._lengths = self ._lengths [idx ]
301
- seq ._is_view = True
378
+
379
+ if isinstance (off_idx , slice ): # Standard list slicing
380
+ seq ._offsets = self ._offsets [off_idx ]
381
+ seq ._lengths = self ._lengths [off_idx ]
302
382
return seq
303
383
304
- elif isinstance (idx , tuple ):
305
- seq = self .__class__ ()
306
- seq ._data = self ._data .__getitem__ ((slice (None ),) + idx [1 :])
307
- seq ._offsets = self ._offsets [idx [0 ]]
308
- seq ._lengths = self ._lengths [idx [0 ]]
309
- seq ._is_view = True
384
+ if isinstance (off_idx , list ) or is_ndarray_of_int_or_bool (off_idx ):
385
+ # Fancy indexing
386
+ seq ._offsets = self ._offsets [off_idx ]
387
+ seq ._lengths = self ._lengths [off_idx ]
310
388
return seq
311
389
312
390
raise TypeError ("Index must be either an int, a slice, a list of int"
0 commit comments