Commit ea84c9d

RF: refactor tck read for speed
Use byte operations to avoid repeated array creation and copying when separating streamlines in the TCK format.
1 parent: 72d146a
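The gist of the change, as a standalone sketch rather than the nibabel API (`fiber_marker` and `split_streamlines` here are invented for illustration): instead of decoding the whole buffer into a float array and scanning for NaN rows on every pass, split the raw byte stream once on the delimiter's exact byte pattern and decode each piece.

import numpy as np

# Stand-in for TCK's fiber delimiter: one (NaN, NaN, NaN) triplet,
# encoded as little-endian float32 to match the on-disk bytes.
dtype = np.dtype('<f4')
fiber_marker = np.array([[np.nan] * 3], dtype=dtype).tobytes()

def split_streamlines(raw):
    # Split once on the delimiter bytes; decode each part to an (N, 3) array.
    for part in raw.split(fiber_marker):
        if part:  # skip empty chunks between back-to-back delimiters
            yield np.frombuffer(part, dtype=dtype).reshape(-1, 3).copy()

# Two short streamlines, each terminated by the delimiter.
raw = (np.zeros((2, 3), dtype=dtype).tobytes() + fiber_marker +
       np.ones((1, 3), dtype=dtype).tobytes() + fiber_marker)
print([s.shape for s in split_streamlines(raw)])  # [(2, 3), (1, 3)]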

File tree

2 files changed (+57, -52 lines)


nibabel/streamlines/tck.py

Lines changed: 42 additions & 48 deletions
@@ -21,7 +21,6 @@
 from .header import Field
 
 MEGABYTE = 1024 * 1024
-BUFFER_SIZE = 1000000
 
 
 def create_empty_header():
@@ -342,8 +341,8 @@ def _read_header(fileobj):
 
         return hdr
 
-    @staticmethod
-    def _read(fileobj, header, buffer_size=4):
+    @classmethod
+    def _read(cls, fileobj, header, buffer_size=4):
         """ Return generator that reads TCK data from `fileobj` given `header`
 
         Parameters
@@ -369,65 +368,60 @@ def _read(fileobj, header, buffer_size=4):
         buffer_size = int(buffer_size * MEGABYTE)
         buffer_size += coordinate_size - (buffer_size % coordinate_size)
 
+        # Markers for streamline end and file end
+        fiber_marker = cls.FIBER_DELIMITER.astype(dtype).tostring()
+        eof_marker = cls.EOF_DELIMITER.astype(dtype).tostring()
+
         with Opener(fileobj) as f:
             start_position = f.tell()
 
             # Set the file position at the beginning of the data.
             f.seek(header["_offset_data"], os.SEEK_SET)
 
             eof = False
-            buff = b""
-            pts = []
-
-            i = 0
-
-            while not eof or not np.all(np.isinf(pts)):
-
-                if not eof:
-                    bytes_read = f.read(buffer_size)
-                    buff += bytes_read
-                    eof = len(bytes_read) == 0
+            buffs = []
+            n_streams = 0
 
-                # Read floats.
-                pts = np.frombuffer(buff, dtype=dtype)
+            while not eof:
 
-                # Convert data to little-endian if needed.
-                if dtype != '<f4':
-                    pts = pts.astype('<f4')
-
-                pts = pts.reshape([-1, 3])
-                idx_nan = np.arange(len(pts))[np.isnan(pts[:, 0])]
+                bytes_read = f.read(buffer_size)
+                buffs.append(bytes_read)
+                eof = len(bytes_read) != buffer_size
 
                 # Make sure we've read enough to find a streamline delimiter.
-                if len(idx_nan) == 0:
+                if fiber_marker not in bytes_read:
                     # If we've read the whole file, then fail.
-                    if eof and not np.all(np.isinf(pts)):
-                        msg = ("Cannot find a streamline delimiter. This file"
-                               " might be corrupted.")
-                        raise DataError(msg)
-
-                    # Otherwise read a bit more.
-                    continue
-
-                nb_pts_total = 0
-                idx_start = 0
-                for idx_end in idx_nan:
-                    nb_pts = len(pts[idx_start:idx_end, :])
-                    nb_pts_total += nb_pts
-
-                    if nb_pts > 0:
-                        yield pts[idx_start:idx_end, :]
-                        i += 1
-
-                    idx_start = idx_end + 1
-
-                # Remove pts plus the first triplet of NaN.
-                nb_tiplets_to_remove = nb_pts_total + len(idx_nan)
-                nb_bytes_to_remove = nb_tiplets_to_remove * 3 * dtype.itemsize
-                buff = buff[nb_bytes_to_remove:]
+                    if eof:
+                        # Could have minimal buffering, and have read only the
+                        # EOF delimiter
+                        buffs = [b''.join(buffs)]
+                        if not buffs[0] == eof_marker:
+                            raise DataError(
+                                "Cannot find a streamline delimiter. This file"
+                                " might be corrupted.")
+                    else:
+                        # Otherwise read a bit more.
+                        continue
+
+                all_parts = b''.join(buffs).split(fiber_marker)
+                point_parts, buffs = all_parts[:-1], all_parts[-1:]
+                point_parts = [p for p in point_parts if p != b'']
+
+                for point_part in point_parts:
+                    # Read floats.
+                    pts = np.frombuffer(point_part, dtype=dtype)
+                    # Enforce ability to write to underlying bytes object
+                    pts.flags.writeable = True
+                    # Convert data to little-endian if needed.
+                    yield pts.astype('<f4', copy=False).reshape([-1, 3])
+
+                n_streams += len(point_parts)
+
+            if not buffs[-1] == eof_marker:
+                raise DataError("Expecting end-of-file marker 'inf inf inf'")
 
             # In case the 'count' field was not provided.
-            header[Field.NB_STREAMLINES] = i
+            header[Field.NB_STREAMLINES] = n_streams
 
             # Set the file position where it was (in case it was already open).
             f.seek(start_position, os.SEEK_CUR)
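A side effect worth noting in the new loop: `np.frombuffer` over a `bytes` object returns a read-only view, which is why the code above sets `pts.flags.writeable = True` before yielding. A minimal standalone demonstration of that behavior (not from the commit; note that NumPy releases from around 1.17 deprecate re-enabling the flag on a read-only buffer, where an explicit copy is the alternative):

import numpy as np

buf = np.arange(6, dtype='<f4').tobytes()  # an immutable bytes object
pts = np.frombuffer(buf, dtype='<f4')
print(pts.flags.writeable)                 # False: view over read-only memory
writable = pts.copy()                      # a copy is always writeable
writable[0] = 99.0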

nibabel/streamlines/tests/test_tck.py

Lines changed: 15 additions & 4 deletions
@@ -6,16 +6,16 @@
 from nibabel.externals.six import BytesIO
 from nibabel.py3k import asbytes
 
-from nose.tools import assert_equal, assert_raises
-
-from nibabel.testing import data_path
-from .test_tractogram import assert_tractogram_equal
 from ..array_sequence import ArraySequence
 from ..tractogram import Tractogram
 from ..tractogram_file import DataError
 
 from ..tck import TckFile
 
+from nose.tools import assert_equal, assert_raises, assert_true
+from numpy.testing import assert_array_equal
+from nibabel.testing import data_path
+from .test_tractogram import assert_tractogram_equal
 
 DATA = {}
 
@@ -62,6 +62,17 @@ def test_load_simple_file(self):
         tck = TckFile(tractogram, header=hdr)
         assert_tractogram_equal(tck.tractogram, DATA['simple_tractogram'])
 
+    def test_writeable_data(self):
+        data = DATA['simple_tractogram']
+        for key in ('simple_tck_fname', 'simple_tck_big_endian_fname'):
+            for lazy_load in [False, True]:
+                tck = TckFile.load(DATA[key], lazy_load=lazy_load)
+                for actual, expected_tgi in zip(tck.streamlines, data):
+                    assert_array_equal(actual, expected_tgi.streamline)
+                    # Test we can write to arrays
+                    assert_true(actual.flags.writeable)
+                    actual[0, 0] = 99
+
     def test_load_simple_file_in_big_endian(self):
         for lazy_load in [False, True]:
             tck = TckFile.load(DATA['simple_tck_big_endian_fname'],
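In user code, the property this test guards looks roughly like the following ('bundle.tck' is a placeholder path):

from nibabel.streamlines.tck import TckFile

tck = TckFile.load('bundle.tck', lazy_load=False)  # placeholder path
streamline = next(iter(tck.streamlines))
streamline[0, 0] = 99.0  # succeeds: the loaded arrays are writeable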
