 given as a folder on local disk
 """
 
-import contextlib
 import mmap
 import os
-import platform
 import time
 from typing import Dict
 from typing import List
@@ -34,17 +32,6 @@
 from opteryx.utils.file_decoders import get_decoder
 
 OS_SEP = os.sep
-IS_LINUX = platform.system() == "Linux"
-
-
-# prefer MAP_PRIVATE and on Linux enable MAP_POPULATE to fault pages in
-flags = mmap.MAP_PRIVATE
-if IS_LINUX:
-    with contextlib.suppress(Exception):
-        flags |= getattr(mmap, "MAP_POPULATE", 0)
-mmap_config = {}
-mmap_config["flags"] = flags
-mmap_config["prot"] = mmap.PROT_READ
 
 
 class DiskConnector(BaseConnector, Partitionable, PredicatePushable, LimitPushable, Statistics):
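The module-level block removed in this hunk built the `mmap` flags once at import: a private mapping, with `MAP_POPULATE` OR-ed in on Linux so pages are faulted in eagerly. For context, a minimal POSIX-only sketch of that pattern; the `read_mapped` helper is illustrative and not part of the connector:

```python
import contextlib
import mmap
import os
import platform

# Prefer a private, read-only mapping; on Linux, MAP_POPULATE pre-faults pages
# so the first scan of the file avoids per-page fault overhead.
flags = mmap.MAP_PRIVATE
if platform.system() == "Linux":
    with contextlib.suppress(Exception):
        flags |= getattr(mmap, "MAP_POPULATE", 0)


def read_mapped(path: str) -> bytes:
    """Illustrative helper: map a file read-only and copy its contents out."""
    fd = os.open(path, os.O_RDONLY)
    try:
        size = os.fstat(fd).st_size
        if size == 0:
            return b""  # mmap cannot map an empty file
        with mmap.mmap(fd, size, flags=flags, prot=mmap.PROT_READ) as mapped:
            return bytes(mapped)
    finally:
        os.close(fd)
```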
@@ -157,7 +144,8 @@ def read_blob(
             return result
         finally:
             # CRITICAL: Clean up the memory mapping
-            unmap_memory(mmap_obj)
+            pass
+            # unmap_memory(mmap_obj)
 
     @single_item_cache
     def get_list_of_blob_names(self, *, prefix: str) -> List[str]:
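The hunk above stubs out the explicit cleanup in `read_blob`'s `finally` block, leaving `unmap_memory(mmap_obj)` commented out. For context, a sketch of the try/finally unmapping pattern the original line implemented; the body shown here for `unmap_memory` is an assumption for illustration only, as the connector imports its own helper:

```python
import mmap


def unmap_memory(mapped) -> None:
    """Assumed behaviour: close the mapping if it is still open; safe to call twice."""
    if mapped is not None and not mapped.closed:
        mapped.close()


def read_all(path: str) -> bytes:
    """Illustrative read of a whole file via mmap that always releases the mapping."""
    with open(path, "rb") as handle:
        mapped = mmap.mmap(handle.fileno(), 0, access=mmap.ACCESS_READ)
        try:
            return bytes(mapped)
        finally:
            # CRITICAL: clean up the memory mapping (the pattern read_blob used)
            unmap_memory(mapped)
```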
@@ -219,7 +207,7 @@ def read_dataset(
             decoder = get_decoder(blob_name)
             try:
                 if not just_schema:
-                    num_rows, _, raw_bytes, decoded = self.read_blob(
+                    num_rows, _, raw_size, decoded = self.read_blob(
                         blob_name=blob_name,
                         decoder=decoder,
                         just_schema=False,
@@ -234,8 +222,8 @@ def read_dataset(
 
                     self.statistics.rows_seen += num_rows
                     self.rows_seen += num_rows
-                    self.statistics.bytes_raw += raw_bytes
                     self.blobs_seen += 1
+                    self.statistics.bytes_raw += raw_size
                     yield decoded
 
                     # if we have read all the rows we need to stop
@@ -247,14 +235,9 @@ def read_dataset(
                         decoder=decoder,
                         just_schema=True,
                     )
-                    # Some decoders may return None for schema (e.g. unreadable
-                    # or undecidable schema). Skip those and continue with the
-                    # next blob instead of trying to access attributes on None.
-                    if schema is None:
-                        continue
                     # if we have more than one blob we need to estimate the row count
                     blob_count = len(blob_names)
-                    if getattr(schema, "row_count_metric", None) and blob_count > 1:
+                    if schema.row_count_metric and blob_count > 1:
                         schema.row_count_estimate = schema.row_count_metric * blob_count
                         schema.row_count_metric = None
                         self.statistics.estimated_row_count += schema.row_count_estimate
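When only a schema is requested and the dataset has more than one blob, the branch above scales the per-blob row count into a dataset-wide estimate. A minimal sketch of that arithmetic, using a stand-in `Schema` dataclass (the real schema object is not shown in this diff and carries more fields):

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class Schema:
    # stand-in for the real schema type; only the two fields used above
    row_count_metric: Optional[int] = None
    row_count_estimate: Optional[int] = None


def estimate_rows(schema: Schema, blob_count: int) -> Schema:
    """Scale the rows counted in one blob to an estimate for the whole dataset."""
    if schema.row_count_metric and blob_count > 1:
        schema.row_count_estimate = schema.row_count_metric * blob_count
        schema.row_count_metric = None  # the exact per-blob count no longer applies
    return schema


# e.g. one sampled blob reports 10,000 rows and the dataset holds 8 blobs
print(estimate_rows(Schema(row_count_metric=10_000), 8).row_count_estimate)  # 80000
```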