Merge pull request #22 from ramonaoptics/multi_threaded2

hmaarrfk · web-flow · commit ee75de35f18b · 2025-12-29T21:19:15.000-05:00
Optimize for multi-threaded workloads by releasing the GIL
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -24,6 +24,15 @@ jobs:
           - os: "macos-latest"
             installer-url: "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-MacOSX-x86_64.sh"
             python-version: "3.13"
+          - os: "macos-14"
+            installer-url: "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-MacOSX-arm64.sh"
+            python-version: "3.12"
+          - os: "macos-14"
+            installer-url: "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-MacOSX-arm64.sh"
+            python-version: "3.13"
+          - os: "macos-14"
+            installer-url: "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-MacOSX-arm64.sh"
+            python-version: "3.14"
           - os: "windows-latest"
             installer-url: "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Windows-x86_64.exe"
             python-version: "3.12"
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,16 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.5.0] - 2025-12-29
+
+- Optimize image reading and writing for multi-threaded workloads by releasing the
+  GIL for entire operations instead of per-line. All tight loops (component and line
+  iterations) are now executed in C++ with the GIL released, significantly improving
+  performance in multi-threaded scenarios. Single-threaded performance is also improved
+  due to reduced Python overhead and better cache locality.
+- Remove temporary buffer allocations during image reading by writing directly to
+  the output array with clipping and dtype conversion handled in C++.
+
 ## [0.4.6] - 2025-12-29
 
 - Fix reading from memory files when the offset parameter is provided.
diff --git a/ojph/_imread.py b/ojph/_imread.py
@@ -1,5 +1,4 @@
 import numpy as np
-import ctypes
 from warnings import warn
 
 from .ojph_bindings import J2CInfile, MemInfile, Codestream
@@ -302,44 +301,11 @@ def read_image(
             min_val = iinfo.min
             max_val = iinfo.max
 
-        if self._num_components == 1:
-            # Single component - always HW format
-            for h in range(height):
-                self._codestream_pull(0, out=image[h], min_val=min_val, max_val=max_val)
-        elif self._channel_order == 'CHW':
-            # Non-RGB multi-component - use planar flag for format detection
-            for c in range(self._num_components):
-                for h in range(height):
-                    self._codestream_pull(c, out=image[c, h, :], min_val=min_val, max_val=max_val)
-        else:
-            # Non-planar mode was used for writing - return HWC format
-            for c in range(self._num_components):
-                for h in range(height):
-                    self._codestream_pull(c, out=image[h, :, c], min_val=min_val, max_val=max_val)
+        self._codestream.pull_all_components(image, self._num_components, self._channel_order, min_val, max_val)
 
         self._close_codestream_and_file()
         return image
 
-    def _codestream_pull(self, component, out, min_val=None, max_val=None):
-        line = self._codestream.pull(component)
-        i32_ptr = ctypes.cast(line.i32_address, ctypes.POINTER(ctypes.c_int32))
-        line_array = np.ctypeslib.as_array(
-            ctypes.cast(i32_ptr, ctypes.POINTER(ctypes.c_int32)),
-            shape=(line.size,)
-        )
-
-        # Aassume min_val is not None if max_val is not None
-        if max_val is not None:
-            line_array = np.clip(
-                line_array,
-                min_val,
-                max_val,
-                out=out,
-                casting='unsafe',
-            )
-        else:
-            out[:] = line_array
-
     def _close_codestream_and_file(self):
         if self._codestream is not None:
             self._codestream.close()
diff --git a/ojph/_imwrite.py b/ojph/_imwrite.py
@@ -1,5 +1,4 @@
 import numpy as np
-import ctypes
 import inspect
 from collections.abc import Buffer
 
@@ -39,8 +38,8 @@ def imwrite_to_memory(
     qstep=None,
     progression_order=None,
     tlm_marker=True,
-    tileparts_at_resolutions=True,
-    tileparts_at_components=False,
+    tileparts_at_resolutions=None,
+    tileparts_at_components=None,
 ):
     mem_outfile = MemOutfile()
     mem_outfile.open(65536, False)
@@ -72,8 +71,8 @@ def imwrite(
     qstep=None,
     progression_order=None,
     tlm_marker=True,
-    tileparts_at_resolutions=True,
-    tileparts_at_components=False,
+    tileparts_at_resolutions=None,
+    tileparts_at_components=None,
 ):
     # Auto-detect channel order if not provided
     if channel_order is None:
@@ -149,46 +148,16 @@ def imwrite(
     if not reversible and qstep is not None:
         codestream.access_qcd().set_irrev_quant(qstep)
     codestream.set_planar(num_components > 1)
-    # Set tile parts for resolution, but not for channels
-    codestream.set_tilepart_divisions(True, False)
+    if tileparts_at_resolutions is None:
+        tileparts_at_resolutions = progression_order == "RLCP"
+    if tileparts_at_components is None:
+        tileparts_at_components = False
+    codestream.set_tilepart_divisions(tileparts_at_resolutions, tileparts_at_components)
     codestream.request_tlm_marker(tlm_marker)
 
     codestream.write_headers(ojph_file, None, 0)
 
-    line = codestream.exchange(None, 0)
-    if channel_order == "HW":
-        # Single component - simple case
-        for i in range(height):
-            i32_ptr = ctypes.cast(line.i32_address, ctypes.POINTER(ctypes.c_uint32))
-            line_array = np.ctypeslib.as_array(
-                ctypes.cast(i32_ptr, ctypes.POINTER(ctypes.c_uint32)),
-                shape=(line.size,)
-            )
-            line_array[...] = image[i, :]
-            line = codestream.exchange(line, 0)
-    elif channel_order == 'HWC':
-        # Multi-component - use planar mode for efficiency
-        # HWC format: image[height, width, channel]
-        for c in range(num_components):
-            for i in range(height):
-                i32_ptr = ctypes.cast(line.i32_address, ctypes.POINTER(ctypes.c_uint32))
-                line_array = np.ctypeslib.as_array(
-                    ctypes.cast(i32_ptr, ctypes.POINTER(ctypes.c_uint32)),
-                    shape=(line.size,)
-                )
-                line_array[...] = image[i, :, c]
-                line = codestream.exchange(line, c)
-    elif channel_order == 'CHW':
-        # CHW format: image[channel, height, width]
-        for c in range(num_components):
-            for i in range(height):
-                i32_ptr = ctypes.cast(line.i32_address, ctypes.POINTER(ctypes.c_uint32))
-                line_array = np.ctypeslib.as_array(
-                    ctypes.cast(i32_ptr, ctypes.POINTER(ctypes.c_uint32)),
-                    shape=(line.size,)
-                )
-                line_array[...] = image[c, i, :]
-                line = codestream.exchange(line, c)
+    codestream.push_all_components(image, num_components, channel_order)
 
     codestream.flush()
     if close_codestream:
diff --git a/ojph/ojph_bindings.cpp b/ojph/ojph_bindings.cpp