-
Notifications
You must be signed in to change notification settings - Fork 2.9k
Add columns support to JSON loader for selective key filtering #7652
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 4 commits
db75657
c7872cb
a0fedf5
d23a48b
5d3cc12
eec7df9
5e93f70
608ed21
9fa38b4
d05759a
428444d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -50,6 +50,7 @@ class JsonConfig(datasets.BuilderConfig): | |
block_size: Optional[int] = None # deprecated | ||
chunksize: int = 10 << 20 # 10MB | ||
newlines_in_values: Optional[bool] = None | ||
columns: Optional[List[str]] = None | ||
|
||
def __post_init__(self): | ||
super().__post_init__() | ||
|
@@ -107,14 +108,20 @@ def _generate_tables(self, files): | |
if df.columns.tolist() == [0]: | ||
df.columns = list(self.config.features) if self.config.features else ["text"] | ||
pa_table = pa.Table.from_pandas(df, preserve_index=False) | ||
|
||
# Filter only selected columns if specified | ||
if self.config.columns is not None: | ||
missing_cols = [col for col in self.config.columns if col not in pa_table.column_names] | ||
for col in missing_cols: | ||
pa_table = pa_table.append_column(col, pa.array([None] * pa_table.num_rows)) | ||
pa_table = pa_table.select(self.config.columns) | ||
|
||
yield file_idx, self._cast_table(pa_table) | ||
|
||
# If the file has one json object per line | ||
else: | ||
with open(file, "rb") as f: | ||
batch_idx = 0 | ||
# Use block_size equal to the chunk size divided by 32 to leverage multithreading | ||
# Set a default minimum value of 16kB if the chunk size is really small | ||
Comment on lines -130 to -131
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. revert this comment deletion and the 2 others There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Wanted clarification on “the 2 others” to ensure no comment restorations were missed. Actually I have restored the two missing comments above — are they in the right place? :) |
||
block_size = max(self.config.chunksize // 32, 16 << 10) | ||
encoding_errors = ( | ||
self.config.encoding_errors if self.config.encoding_errors is not None else "strict" | ||
|
@@ -123,12 +130,10 @@ def _generate_tables(self, files): | |
batch = f.read(self.config.chunksize) | ||
if not batch: | ||
break | ||
# Finish current line | ||
try: | ||
batch += f.readline() | ||
except (AttributeError, io.UnsupportedOperation): | ||
batch += readline(f) | ||
# PyArrow only accepts utf-8 encoded bytes | ||
if self.config.encoding != "utf-8": | ||
batch = batch.decode(self.config.encoding, errors=encoding_errors).encode("utf-8") | ||
try: | ||
|
@@ -137,6 +142,12 @@ def _generate_tables(self, files): | |
pa_table = paj.read_json( | ||
io.BytesIO(batch), read_options=paj.ReadOptions(block_size=block_size) | ||
) | ||
if self.config.columns is not None: | ||
missing_cols = [col for col in self.config.columns if col not in pa_table.column_names] | ||
for col in missing_cols: | ||
pa_table = pa_table.append_column(col, pa.array([None] * pa_table.num_rows)) | ||
pa_table = pa_table.select(self.config.columns) | ||
yield (file_idx, batch_idx), self._cast_table(pa_table) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would keep this at the end, where you removed the |
||
break | ||
except (pa.ArrowInvalid, pa.ArrowNotImplementedError) as e: | ||
if ( | ||
|
@@ -146,8 +157,6 @@ def _generate_tables(self, files): | |
): | ||
raise | ||
else: | ||
# Increase the block size in case it was too small. | ||
# The block size will be reset for the next file. | ||
logger.debug( | ||
f"Batch of {len(batch)} bytes couldn't be parsed with block_size={block_size}. Retrying with block_size={block_size * 2}." | ||
) | ||
|
@@ -165,14 +174,18 @@ def _generate_tables(self, files): | |
df.columns = list(self.config.features) if self.config.features else ["text"] | ||
try: | ||
pa_table = pa.Table.from_pandas(df, preserve_index=False) | ||
if self.config.columns is not None: | ||
missing_cols = [col for col in self.config.columns if col not in pa_table.column_names] | ||
for col in missing_cols: | ||
pa_table = pa_table.append_column(col, pa.array([None] * pa_table.num_rows)) | ||
pa_table = pa_table.select(self.config.columns) | ||
yield (file_idx, batch_idx), self._cast_table(pa_table) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. same |
||
except pa.ArrowInvalid as e: | ||
logger.error( | ||
f"Failed to convert pandas DataFrame to Arrow Table from file '{file}' with error {type(e)}: {e}" | ||
) | ||
raise ValueError( | ||
f"Failed to convert pandas DataFrame to Arrow Table from file {file}." | ||
) from None | ||
yield file_idx, self._cast_table(pa_table) | ||
break | ||
yield (file_idx, batch_idx), self._cast_table(pa_table) | ||
batch_idx += 1 |
Uh oh!
There was an error while loading. Please reload this page.