|  | 
| 9 | 9 | """ | 
| 10 | 10 | 
 | 
| 11 | 11 | import contextlib | 
| 12 |  | -import ctypes | 
| 13 | 12 | import mmap | 
| 14 | 13 | import os | 
| 15 | 14 | import platform | 
|  | 
| 31 | 30 | from opteryx.exceptions import DatasetNotFoundError | 
| 32 | 31 | from opteryx.exceptions import EmptyDatasetError | 
| 33 | 32 | from opteryx.exceptions import UnsupportedFileTypeError | 
| 34 |  | -from opteryx.utils import is_windows | 
| 35 | 33 | from opteryx.utils.file_decoders import TUPLE_OF_VALID_EXTENSIONS | 
| 36 | 34 | from opteryx.utils.file_decoders import get_decoder | 
| 37 | 35 | 
 | 
| 38 | 36 | OS_SEP = os.sep | 
| 39 |  | -IS_WINDOWS = is_windows() | 
| 40 | 37 | IS_LINUX = platform.system() == "Linux" | 
| 41 | 38 | 
 | 
| 42 |  | -# Define os.O_BINARY for non-Windows platforms if it's not already defined | 
| 43 |  | -if not hasattr(os, "O_BINARY"): | 
| 44 |  | -    os.O_BINARY = 0  # Value has no effect on non-Windows platforms | 
| 45 |  | -if not hasattr(os, "O_DIRECT"): | 
| 46 |  | -    os.O_DIRECT = 0  # Value has no effect on non-Windows platforms | 
| 47 | 39 | 
 | 
|  | 40 | +# prefer MAP_PRIVATE and on Linux enable MAP_POPULATE to fault pages in | 
|  | 41 | +flags = mmap.MAP_PRIVATE | 
|  | 42 | +if IS_LINUX: | 
|  | 43 | +    with contextlib.suppress(Exception): | 
|  | 44 | +        flags |= getattr(mmap, "MAP_POPULATE", 0) | 
| 48 | 45 | mmap_config = {} | 
| 49 |  | -if not IS_WINDOWS: | 
| 50 |  | -    # prefer MAP_PRIVATE and on Linux enable MAP_POPULATE to fault pages in | 
| 51 |  | -    flags = mmap.MAP_PRIVATE | 
| 52 |  | -    if IS_LINUX and hasattr(mmap, "MAP_POPULATE"): | 
| 53 |  | -        with contextlib.suppress(Exception): | 
| 54 |  | -            flags |= mmap.MAP_POPULATE | 
| 55 |  | -    mmap_config["flags"] = flags | 
| 56 |  | -    mmap_config["prot"] = mmap.PROT_READ | 
| 57 |  | -else: | 
| 58 |  | -    mmap_config["access"] = mmap.ACCESS_READ | 
|  | 46 | +mmap_config["flags"] = flags | 
|  | 47 | +mmap_config["prot"] = mmap.PROT_READ | 
| 59 | 48 | 
 | 
| 60 | 49 | 
 | 
| 61 | 50 | class DiskConnector(BaseConnector, Partitionable, PredicatePushable, LimitPushable, Statistics): | 
| @@ -138,31 +127,73 @@ def read_blob( | 
| 138 | 127 |             OSError: | 
| 139 | 128 |                 If an I/O error occurs while reading the file. | 
| 140 | 129 |         """ | 
| 141 |  | -        try: | 
| 142 |  | -            file_descriptor = os.open(blob_name, os.O_RDONLY | os.O_BINARY) | 
| 143 |  | -            if hasattr(os, "posix_fadvise"): | 
| 144 |  | -                os.posix_fadvise(file_descriptor, 0, 0, os.POSIX_FADV_WILLNEED) | 
| 145 |  | -            size = os.fstat(file_descriptor).st_size | 
| 146 |  | -            _map = mmap.mmap(file_descriptor, length=size, **mmap_config) | 
| 147 |  | -            result = decoder( | 
| 148 |  | -                _map, | 
| 149 |  | -                just_schema=just_schema, | 
| 150 |  | -                projection=projection, | 
| 151 |  | -                selection=selection, | 
| 152 |  | -                use_threads=True, | 
| 153 |  | -            ) | 
| 154 |  | -            self.statistics.bytes_read += size | 
|  | 130 | +        # Hybrid strategy: choose mmap or read+memoryview depending on OS | 
|  | 131 | +        # macOS -> mmap, every other platform (including Linux) -> read. | 
|  | 132 | + | 
|  | 133 | +        # helper to use mmap path | 
|  | 134 | +        def _use_mmap(): | 
|  | 135 | +            fd = os.open(blob_name, os.O_RDONLY) | 
|  | 136 | +            try: | 
|  | 137 | +                if hasattr(os, "posix_fadvise"): | 
|  | 138 | +                    with contextlib.suppress(Exception): | 
|  | 139 | +                        os.posix_fadvise(fd, 0, 0, os.POSIX_FADV_WILLNEED) | 
|  | 140 | +                size = os.fstat(fd).st_size | 
|  | 141 | +                _map = mmap.mmap(fd, length=size, **mmap_config) | 
|  | 142 | +                result = decoder( | 
|  | 143 | +                    _map, | 
|  | 144 | +                    just_schema=just_schema, | 
|  | 145 | +                    projection=projection, | 
|  | 146 | +                    selection=selection, | 
|  | 147 | +                    use_threads=True, | 
|  | 148 | +                ) | 
|  | 149 | + | 
|  | 150 | +                self.statistics.bytes_read += size | 
| 155 | 151 | 
 | 
| 156 |  | -            if not just_schema: | 
| 157 |  | -                stats = self.read_blob_statistics( | 
| 158 |  | -                    blob_name=blob_name, blob_bytes=_map, decoder=decoder | 
|  | 152 | +                if not just_schema: | 
|  | 153 | +                    stats = self.read_blob_statistics( | 
|  | 154 | +                        blob_name=blob_name, blob_bytes=_map, decoder=decoder | 
|  | 155 | +                    ) | 
|  | 156 | +                    if self.relation_statistics is None: | 
|  | 157 | +                        self.relation_statistics = stats | 
|  | 158 | + | 
|  | 159 | +                return result | 
|  | 160 | +            finally: | 
|  | 161 | +                os.close(fd) | 
|  | 162 | + | 
|  | 163 | +        # helper to use read()+memoryview path | 
|  | 164 | +        def _use_read(): | 
|  | 165 | +            with open(blob_name, "rb") as f: | 
|  | 166 | +                if hasattr(os, "posix_fadvise"): | 
|  | 167 | +                    with contextlib.suppress(Exception): | 
|  | 168 | +                        os.posix_fadvise(f.fileno(), 0, 0, os.POSIX_FADV_WILLNEED) | 
|  | 169 | + | 
|  | 170 | +                data = f.read() | 
|  | 171 | +                size = len(data) | 
|  | 172 | +                buf = memoryview(data) | 
|  | 173 | + | 
|  | 174 | +                result = decoder( | 
|  | 175 | +                    buf, | 
|  | 176 | +                    just_schema=just_schema, | 
|  | 177 | +                    projection=projection, | 
|  | 178 | +                    selection=selection, | 
|  | 179 | +                    use_threads=True, | 
| 159 | 180 |                 ) | 
| 160 |  | -                if self.relation_statistics is None: | 
| 161 |  | -                    self.relation_statistics = stats | 
| 162 | 181 | 
 | 
| 163 |  | -            return result | 
| 164 |  | -        finally: | 
| 165 |  | -            os.close(file_descriptor) | 
|  | 182 | +                self.statistics.bytes_read += size | 
|  | 183 | + | 
|  | 184 | +                if not just_schema: | 
|  | 185 | +                    stats = self.read_blob_statistics( | 
|  | 186 | +                        blob_name=blob_name, blob_bytes=buf, decoder=decoder | 
|  | 187 | +                    ) | 
|  | 188 | +                    if self.relation_statistics is None: | 
|  | 189 | +                        self.relation_statistics = stats | 
|  | 190 | + | 
|  | 191 | +                return result | 
|  | 192 | + | 
|  | 193 | +        # macOS: use mmap; every other platform: use read (observed faster on some Linux setups) | 
|  | 194 | +        if platform.system() == "Darwin": | 
|  | 195 | +            return _use_mmap() | 
|  | 196 | +        return _use_read() | 
| 166 | 197 | 
 | 
| 167 | 198 |     @single_item_cache | 
| 168 | 199 |     def get_list_of_blob_names(self, *, prefix: str) -> List[str]: | 
|  | 
0 commit comments