6
6
import json
7
7
import logging as _logging
8
8
import os .path
9
- import re
10
9
import typing
11
10
from collections import ChainMap , Counter
12
11
from pathlib import Path
@@ -119,57 +118,40 @@ def _open_input(p: PathOrIO) -> TextIO:
119
118
return io .StringIO (file_content )
120
119
121
120
122
- def _separate_metadata_and_table_from_stream (stream : TextIO ):
123
- stream .seek (0 )
124
-
125
- # Create a new StringIO object for filtered data
126
- table_component = io .StringIO ()
127
- metadata_component = io .StringIO ()
128
-
129
- header_section = True
130
-
131
- # Filter out lines starting with '#'
132
- for line in stream :
133
- if not line .startswith ("#" ):
134
- table_component .write (line )
135
- if header_section :
136
- header_section = False
137
- elif header_section :
138
- # We strip any trailing tabs. Such tabs may have been left
139
- # by a spreadsheet editor who treated the header lines as
140
- # if they were normal data lines; they would prevent the
141
- # YAML parser from correctly parsing the metadata block.
142
- metadata_component .write (line .rstrip ("\t \n " ) + "\n " )
143
- else :
144
- logging .info (
145
- f"Line { line } is starting with hash symbol, but header section is already passed. "
146
- f"This line is skipped"
147
- )
148
-
149
- # Reset the cursor to the start of the new StringIO object
150
- table_component .seek (0 )
151
- metadata_component .seek (0 )
152
- return table_component , metadata_component
153
-
154
-
155
- def _read_pandas_and_metadata (file_path : PathOrIO , sep : Optional [str ] = None ):
121
+ def _read_pandas_and_metadata (
122
+ file_path : Union [str , Path , TextIO ], sep : Optional [str ] = None
123
+ ) -> tuple [pd .DataFrame , MetadataType ]:
156
124
"""Read a tabular data file by wrapping :func:`pd.read_csv` to handle comment lines correctly.
157
125
158
126
:param file_path: The file path or stream to read
159
127
:param sep: File separator for pandas
160
- :return: A pandas dataframe
128
+ :return: A pair of a dataframe and metadata dictionary
161
129
"""
162
130
if sep is None :
163
- sep = _infer_separator (file_path )
131
+ sep = _infer_separator (file_path ) or " \t "
164
132
165
133
if isinstance (file_path , (str , Path )):
166
134
raise_for_bad_path (file_path )
167
135
168
136
stream = _open_input (file_path )
169
- table_stream , metadata_stream = _separate_metadata_and_table_from_stream (stream )
137
+
138
+ # consume lines from the top of the stream until one no longer starts with #
139
+ header_yaml = ""
140
+ while (line := stream .readline ()).startswith ("#" ):
141
+ line = line .lstrip ("#" ).rstrip ()
142
+ if not line :
143
+ continue
144
+ header_yaml += line + "\n "
145
+
146
+ sssom_metadata = yaml .safe_load (header_yaml ) if header_yaml else {}
147
+
148
+ # The first line that doesn't start with a # is assumed
149
+ # to be the header, so we split it with the inferred separator
150
+ names = line .strip ().split (sep )
170
151
171
152
try :
172
- df = pd .read_csv (table_stream , sep = sep , dtype = str , engine = "python" )
153
+ # pandas can keep going and read from the same stream that we already have
154
+ df = pd .read_csv (stream , sep = sep , dtype = str , engine = "python" , header = None , names = names )
173
155
except EmptyDataError as e :
174
156
logging .warning (f"Seems like the dataframe is empty: { e } " )
175
157
df = pd .DataFrame (
@@ -184,7 +166,6 @@ def _read_pandas_and_metadata(file_path: PathOrIO, sep: Optional[str] = None):
184
166
else :
185
167
df .fillna ("" , inplace = True )
186
168
187
- sssom_metadata = _read_metadata_from_table (metadata_stream )
188
169
return df , sssom_metadata
189
170
190
171
@@ -895,21 +876,6 @@ def _swap_object_subject(mapping: Mapping) -> Mapping:
895
876
return mapping
896
877
897
878
898
- def _read_metadata_from_table (stream : io .StringIO ) -> Dict [str , Any ]:
899
- yamlstr = ""
900
- for line in stream :
901
- if line .startswith ("#" ):
902
- yamlstr += re .sub ("^#" , "" , line )
903
- else :
904
- break
905
-
906
- if yamlstr :
907
- meta = yaml .safe_load (yamlstr )
908
- logging .info (f"Meta={ meta } " )
909
- return meta
910
- return {}
911
-
912
-
913
879
def _set_metadata_in_mapping_set (
914
880
mapping_set : MappingSet , metadata : Optional [MetadataType ] = None , overwrite : bool = True
915
881
) -> None :
0 commit comments