Skip to content

Commit dd864fd

Browse files
committed
Slight improvement to tokenizer
AN003335 had a new unique problem with some of the Additional data in the SSF. I changed the code so it can handle that situation.
1 parent a45ed17 commit dd864fd

File tree

2 files changed

+43
-17
lines changed

2 files changed

+43
-17
lines changed

docs/todo.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,3 +8,5 @@ Add --limit or --ignore option to validate to filter out certain types of errors
88
Think about extending METABOLITES and EXTENDED blocks with an "Attributes" line like "Factors" in DATA block as a way to add more information about the columns themselves.
99
Hunter also wanted to consider adding things like the _factors properties into the JSON as well. For example, the _factors could be added into ['MS_METABOLITE_DATA'] under a 'Factors' key.
1010

11+
Think about adding an "UNASSIGNED" data block for the datasets we found that have a results_file instead of having the data in the mwTab file.
12+
Pretty sure most, if not all, of these are unassigned data, where there are basically bins and no metabolite assignments.

src/mwtab/tokenizer.py

Lines changed: 41 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -100,16 +100,27 @@ def tokenizer(text, dict_type = None):
100100
elif line.startswith("SUBJECT_SAMPLE_FACTORS"):
101101
line_items = line.split("\t")
102102

103-
factor_pairs = line_items[3].split(" | ")
104103
factor_dict = dict_type()
105-
for pair in factor_pairs:
106-
try:
107-
factor_key, factor_value = pair.split(":")
108-
except ValueError as e:
109-
raise ValueError("Expected exactly 1 ':' in the factor key value pair, '" + pair + "'") from e
110-
factor_key = factor_key.strip()
111-
factor_value = factor_value.strip()
112-
factor_dict[factor_key] = factor_value
104+
colon_split = line_items[3].split(":")
105+
factor_items = [item for factor_item in colon_split for item in factor_item.rsplit(' | ', 1)]
106+
element_indexes_without_bar = [i+1 for i, value in enumerate(colon_split[1:-1]) if ' | ' not in value]
107+
if element_indexes_without_bar:
108+
index = element_indexes_without_bar[0]
109+
factor_item_start = colon_split[index-1]
110+
if ' | ' in factor_item_start:
111+
factor_item_start = factor_item_start.split(' | ')[1]
112+
factor_item_end = colon_split[index+1]
113+
if ' | ' in factor_item_end:
114+
factor_item_end = factor_item_end.split(' | ')[0]
115+
factor_item = ':'.join([factor_item_start,
116+
colon_split[index],
117+
factor_item_end])
118+
message = ("Either a bar (' | ') separating 2 items is missing or there is an extra colon (':') "
119+
"in the factor key value pair, '" + factor_item + "'")
120+
raise ValueError(message)
121+
122+
for key, value in zip(factor_items[0::2], factor_items[1::2]):
123+
factor_dict[key.strip()] = value.strip()
113124

114125
subject_sample_factors_dict = {
115126
"Subject ID": line_items[1],
@@ -118,14 +129,27 @@ def tokenizer(text, dict_type = None):
118129
}
119130
if line_items[4]:
120131
additional_data = dict_type()
121-
for factor_item in line_items[4].split("; "):
122-
try:
123-
key, value = factor_item.split("=")
124-
except ValueError as e:
125-
raise ValueError("Expected exactly 1 '=' in the additional data key value pair, '" + factor_item + "'") from e
126-
key = key.strip()
127-
value = value.strip()
128-
additional_data[key] = value
132+
equal_split = line_items[4].split("=")
133+
add_items = [item for add_item in equal_split for item in add_item.rsplit('; ', 1)]
134+
element_indexes_without_semicolon = [i+1 for i, value in enumerate(equal_split[1:-1]) if '; ' not in value]
135+
if element_indexes_without_semicolon:
136+
index = element_indexes_without_semicolon[0]
137+
add_item_start = equal_split[index-1]
138+
if '; ' in add_item_start:
139+
add_item_start = add_item_start.split('; ')[1]
140+
add_item_end = equal_split[index+1]
141+
if '; ' in add_item_end:
142+
add_item_end = add_item_end.split('; ')[0]
143+
add_item = '='.join([add_item_start,
144+
equal_split[index],
145+
add_item_end])
146+
message = ("Either a semicolon ('; ') separating 2 items is missing or there is an extra equal sign ('=') "
147+
"in the additional data key value pair, '" + add_item + "'")
148+
raise ValueError(message)
149+
150+
for key, value in zip(add_items[0::2], add_items[1::2]):
151+
additional_data[key.strip()] = value.strip()
152+
129153
subject_sample_factors_dict["Additional sample data"] = additional_data
130154
yield KeyValue(line_items[0].strip(), subject_sample_factors_dict)
131155

0 commit comments

Comments
 (0)