Skip to content

Commit 08abed2

Browse files
authored
Simplify reading CSV/TSV (#591)
* Simplify reading CSV * Update parsers.py * Update parsers.py * Update parsers.py * Update parsers.py * Update parsers.py * Update parsers.py * Update parsers.py * Update parsers.py * Improve docs * Fix bug in test
1 parent 144b388 commit 08abed2

File tree

2 files changed

+26
-57
lines changed

2 files changed

+26
-57
lines changed

src/sssom/parsers.py

Lines changed: 21 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
import json
77
import logging as _logging
88
import os.path
9-
import re
109
import typing
1110
from collections import ChainMap, Counter
1211
from pathlib import Path
@@ -119,57 +118,40 @@ def _open_input(p: PathOrIO) -> TextIO:
119118
return io.StringIO(file_content)
120119

121120

122-
def _separate_metadata_and_table_from_stream(stream: TextIO):
123-
stream.seek(0)
124-
125-
# Create a new StringIO object for filtered data
126-
table_component = io.StringIO()
127-
metadata_component = io.StringIO()
128-
129-
header_section = True
130-
131-
# Filter out lines starting with '#'
132-
for line in stream:
133-
if not line.startswith("#"):
134-
table_component.write(line)
135-
if header_section:
136-
header_section = False
137-
elif header_section:
138-
# We strip any trailing tabs. Such tabs may have been left
139-
# by a spreadsheet editor who treated the header lines as
140-
# if they were normal data lines; they would prevent the
141-
# YAML parser from correctly parsing the metadata block.
142-
metadata_component.write(line.rstrip("\t\n") + "\n")
143-
else:
144-
logging.info(
145-
f"Line {line} is starting with hash symbol, but header section is already passed. "
146-
f"This line is skipped"
147-
)
148-
149-
# Reset the cursor to the start of the new StringIO object
150-
table_component.seek(0)
151-
metadata_component.seek(0)
152-
return table_component, metadata_component
153-
154-
155-
def _read_pandas_and_metadata(file_path: PathOrIO, sep: Optional[str] = None):
121+
def _read_pandas_and_metadata(
122+
file_path: Union[str, Path, TextIO], sep: Optional[str] = None
123+
) -> tuple[pd.DataFrame, MetadataType]:
156124
"""Read a tabular data file by wrapping :func:`pd.read_csv` to handle comment lines correctly.
157125
158126
:param file_path: The file path or stream to read
159127
:param sep: File separator for pandas
160-
:return: A pandas dataframe
128+
:return: A pair of a dataframe and metadata dictionary
161129
"""
162130
if sep is None:
163-
sep = _infer_separator(file_path)
131+
sep = _infer_separator(file_path) or "\t"
164132

165133
if isinstance(file_path, (str, Path)):
166134
raise_for_bad_path(file_path)
167135

168136
stream = _open_input(file_path)
169-
table_stream, metadata_stream = _separate_metadata_and_table_from_stream(stream)
137+
138+
# consume from the top of the stream until there's no more preceding #
139+
header_yaml = ""
140+
while (line := stream.readline()).startswith("#"):
141+
line = line.lstrip("#").rstrip()
142+
if not line:
143+
continue
144+
header_yaml += line + "\n"
145+
146+
sssom_metadata = yaml.safe_load(header_yaml) if header_yaml else {}
147+
148+
# The first line that doesn't start with a # is assumed
149+
# to be the header, so we split it with the inferred separator
150+
names = line.strip().split(sep)
170151

171152
try:
172-
df = pd.read_csv(table_stream, sep=sep, dtype=str, engine="python")
153+
# pandas can keep going and read from the same stream that we already have
154+
df = pd.read_csv(stream, sep=sep, dtype=str, engine="python", header=None, names=names)
173155
except EmptyDataError as e:
174156
logging.warning(f"Seems like the dataframe is empty: {e}")
175157
df = pd.DataFrame(
@@ -184,7 +166,6 @@ def _read_pandas_and_metadata(file_path: PathOrIO, sep: Optional[str] = None):
184166
else:
185167
df.fillna("", inplace=True)
186168

187-
sssom_metadata = _read_metadata_from_table(metadata_stream)
188169
return df, sssom_metadata
189170

190171

@@ -895,21 +876,6 @@ def _swap_object_subject(mapping: Mapping) -> Mapping:
895876
return mapping
896877

897878

898-
def _read_metadata_from_table(stream: io.StringIO) -> Dict[str, Any]:
899-
yamlstr = ""
900-
for line in stream:
901-
if line.startswith("#"):
902-
yamlstr += re.sub("^#", "", line)
903-
else:
904-
break
905-
906-
if yamlstr:
907-
meta = yaml.safe_load(yamlstr)
908-
logging.info(f"Meta={meta}")
909-
return meta
910-
return {}
911-
912-
913879
def _set_metadata_in_mapping_set(
914880
mapping_set: MappingSet, metadata: Optional[MetadataType] = None, overwrite: bool = True
915881
) -> None:

tests/test_parsers.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -470,8 +470,11 @@ def test_strict_parsing(self):
470470
parse_sssom_table(stream, strict=True)
471471

472472
# Make sure it parses in non-strict mode
473-
msdf = parse_sssom_table(stream)
474-
self.assertEqual(len(msdf.df), 2)
473+
with open(input_path, "r") as file:
474+
input_string = file.read()
475+
stream2 = io.StringIO(input_string)
476+
msdf = parse_sssom_table(stream2, strict=False)
477+
self.assertEqual(2, len(msdf.df))
475478

476479
def test_check_irregular_metadata(self):
477480
"""Test if irregular metadata check works according to https://w3id.org/sssom/spec."""

0 commit comments

Comments (0)