Skip to content

Commit ac04165

Browse files
committed
Update reading inputs and add additional tests for summary
1 parent 9e6ba79 commit ac04165

File tree

7 files changed

+452
-15
lines changed

7 files changed

+452
-15
lines changed

beams/annotation.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -683,6 +683,8 @@ def annotate_compounds(peaklist, lib_adducts, ppm, db_out, db_name, db_in=""):
683683
cursor_local.executescript(db_dump.read().decode('utf-8'))
684684
conn_local.commit()
685685

686+
cursor_local.execute("CREATE INDEX idx_exact_mass ON {} (exact_mass)".format(db_name.replace(".sql.gz", "")))
687+
686688
cursor_local.execute("SELECT name FROM sqlite_master WHERE type='table'")
687689
if (db_name.replace(".sql.gz", ""), ) not in cursor_local.fetchall():
688690
raise ValueError("Database {} not available".format(db_name))
@@ -882,7 +884,7 @@ def annotate_drug_products(peaklist, db_out, list_smiles, lib_adducts, ppm, phas
882884
comp = pyteomics_mass.Composition(mf)
883885
record.update(comp)
884886
record["molecular_formula"] = composition_to_string(comp)
885-
record["exact_mass"] = round(pyteomics_mass.calculate_mass(formula=str(mf), mass_data=nist_db), 6)
887+
record["exact_mass"] = round(pyteomics_mass.calculate_mass(formula=str(mf), mass_data=nist_database), 6)
886888
record["CHNOPS"] = sum([comp[e] for e in comp if e in ["C", "H", "N", "O", "P", "S"]]) == sum(list(comp.values()))
887889
records.append(record)
888890

@@ -919,7 +921,7 @@ def summary(df, db, single_row=False, single_column=False, convert_rt=None, ndig
919921
cursor = conn.cursor()
920922

921923
cursor.execute("DROP TABLE IF EXISTS peaklist")
922-
df[['name', 'mz', 'rt', "intensity"]].sort_values(by=["rt", "mz"]).to_sql('peaklist', conn, index=False)
924+
df[["name", "mz", "rt", "intensity"]].to_sql("peaklist", conn, index=False)
923925

924926
cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
925927
tables = cursor.fetchall()
@@ -1155,7 +1157,7 @@ def summary(df, db, single_row=False, single_column=False, convert_rt=None, ndig
11551157
join_peak_labels = ""
11561158

11571159
query = """CREATE TABLE summary AS SELECT
1158-
peaklist.name, peaklist.mz, peaklist.rt{}{}
1160+
peaklist.name, peaklist.mz, peaklist.rt, peaklist.intensity{}{}
11591161
FROM peaklist
11601162
{}
11611163
{}
@@ -1212,7 +1214,7 @@ def summary(df, db, single_row=False, single_column=False, convert_rt=None, ndig
12121214
group_concat(round(ppm_error, 2), '||') as ppm_error
12131215
""")
12141216

1215-
query = """SELECT DISTINCT name, mz, rt, {}
1217+
query = """SELECT DISTINCT name, mz, rt, intensity, {}
12161218
from summary
12171219
GROUP BY NAME
12181220
ORDER BY rowid

beams/in_out.py

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -154,7 +154,9 @@ def read_xset_matrix(fn_matrix, first_sample, separator="\t", mapping={"mz": "mz
154154
return pd.concat([df_peaklist, df_matrix], axis=1)
155155

156156

157-
def combine_peaklist_matrix(fn_peaklist, fn_matrix, separator="\t", mapping={"name": "name", "mz": "mz", "rt": "rt"}, merge_on="name", samples_in_columns=True):
157+
def combine_peaklist_matrix(fn_peaklist, fn_matrix, separator="\t", median_intensity=True,
158+
mapping={"name": "name", "mz": "mz", "rt": "rt", "intensity": "intensity"},
159+
merge_on="name", samples_in_columns=True):
158160
if "mz" not in mapping and "rt" not in mapping and "name" not in mapping:
159161
raise ValueError("Incorrect column mapping: provide column names for mz, and name")
160162

@@ -176,7 +178,12 @@ def combine_peaklist_matrix(fn_peaklist, fn_matrix, separator="\t", mapping={"na
176178
df_peaklist.columns = ["name", "mz", "rt"]
177179

178180
df_matrix = df_matrix.rename(columns={mapping["name"]: 'name'})
179-
df_peaklist["intensity"] = pd.Series(df_matrix.median(axis=1, skipna=True), index=df_matrix.index)
181+
182+
if mapping["intensity"] not in df_peaklist.columns:
183+
if median_intensity:
184+
df_peaklist["intensity"] = pd.Series(df_matrix.median(axis=1, skipna=True), index=df_matrix.index)
185+
else:
186+
df_peaklist["intensity"] = pd.Series(df_matrix.mean(axis=1, skipna=True), index=df_matrix.index)
180187

181188
if len(df_peaklist[mapping["name"]].unique()) != len(df_peaklist[mapping["name"]]):
182189
raise ValueError("Peaklist: Values column '{}' are not unique".format(mapping["name"]))
@@ -187,7 +194,8 @@ def combine_peaklist_matrix(fn_peaklist, fn_matrix, separator="\t", mapping={"na
187194

188195

189196

190-
def read_peaklist(fn_peaklist, separator="\t", mapping={"name": "name", "mz": "mz", "rt": "rt", "intensity": "intensity"}):
197+
def read_peaklist(fn_peaklist, separator="\t",
198+
mapping={"name": "name", "mz": "mz", "rt": "rt", "intensity": "intensity"}):
191199

192200
df_peaklist = pd.read_csv(fn_peaklist, header=0, sep=separator, dtype={"name": str}, float_precision="round_trip")
193201
if mapping["mz"] not in df_peaklist.columns.values or mapping["intensity"] not in df_peaklist.columns.values:
@@ -200,9 +208,12 @@ def read_peaklist(fn_peaklist, separator="\t", mapping={"name": "name", "mz": "m
200208
df_peaklist.columns = ["mz", "intensity"]
201209
df_peaklist.insert(0, "name", [str(x).replace(".","_") for x in df_peaklist[mapping["mz"]]])
202210
df_peaklist["mz"] = df_peaklist["mz"].astype(float)
211+
df_peaklist["intensity"] = df_peaklist["intensity"].astype(float)
203212
else:
204213
df_peaklist = df_peaklist[[mapping["name"], mapping["mz"], mapping["intensity"]]]
205214
df_peaklist.columns = ["name", "mz", "intensity"]
215+
df_peaklist["mz"] = df_peaklist["mz"].astype(float)
216+
df_peaklist["intensity"] = df_peaklist["intensity"].astype(float)
206217
df_peaklist.insert(2, "rt", 0.0)
207218
elif "rt" in mapping:
208219
if mapping["name"] in df_peaklist.columns.values:
@@ -223,4 +234,8 @@ def read_peaklist(fn_peaklist, separator="\t", mapping={"name": "name", "mz": "m
223234
df_peaklist = df_peaklist[[mapping["name"], mapping["mz"], mapping["rt"], mapping["intensity"]]]
224235
df_peaklist.columns = ["name", "mz", "rt", "intensity"]
225236

237+
df_peaklist["mz"] = df_peaklist["mz"].astype(float)
238+
df_peaklist["rt"] = df_peaklist["rt"].astype(float)
239+
df_peaklist["intensity"] = df_peaklist["intensity"].astype(float)
240+
226241
return df_peaklist

tests/test_annotation.py

Lines changed: 23 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -116,15 +116,30 @@ def test_annotate_molecular_formulae(self):
116116
self.assertEqual(sqlite_count(to_test_results(self.db_results), "molecular_formulae"), 16)
117117

118118
def test_summary(self):
119+
120+
def _assert(summary_test_data, summary_result):
121+
with open(summary_result) as result:
122+
with open(summary_test_data) as test_data:
123+
lines_results = result.read().splitlines()
124+
lines_test_data = test_data.read().splitlines()
125+
for i in range(len(lines_results)):
126+
self.assertTrue(lines_results[i], lines_test_data[i])
127+
self.assertEqual(sqlite_records(to_test_results(self.db_results), "summary"), sqlite_records(to_test_data(self.db_results), "summary"))
128+
129+
fn_summary = "summary_mr_mc.txt"
119130
df_summary = summary(self.df, to_test_results(self.db_results), single_row=False, single_column=False, convert_rt=None, ndigits_mz=None)
120-
df_summary.to_csv(to_test_results("summary.txt"), sep="\t", index=False)
121-
with open(to_test_results("summary.txt")) as result:
122-
with open(to_test_data("summary.txt")) as test_data:
123-
lines_results = result.read().splitlines()
124-
lines_test_data = test_data.read().splitlines()
125-
for i in range(len(lines_results)):
126-
self.assertTrue(lines_results[i], lines_test_data[i])
127-
self.assertEqual(sqlite_records(to_test_results(self.db_results), "summary"), sqlite_records(to_test_data(self.db_results), "summary"))
131+
df_summary.to_csv(to_test_results(fn_summary), sep="\t", index=False)
132+
_assert(to_test_data(fn_summary), to_test_results(fn_summary))
133+
134+
fn_summary = "summary_sr_mc.txt"
135+
df_summary = summary(self.df, to_test_results(self.db_results), single_row=True, single_column=False, convert_rt=None, ndigits_mz=None)
136+
df_summary.to_csv(to_test_results(fn_summary), sep="\t", index=False)
137+
_assert(to_test_data(fn_summary), to_test_results(fn_summary))
138+
139+
fn_summary = "summary_sr_sc.txt"
140+
df_summary = summary(self.df, to_test_results(self.db_results), single_row=True, single_column=True, convert_rt=None, ndigits_mz=None)
141+
df_summary.to_csv(to_test_results(fn_summary), sep="\t", index=False)
142+
_assert(to_test_data(fn_summary), to_test_results(fn_summary))
128143

129144

130145
if __name__ == '__main__':
0 Bytes
Binary file not shown.

0 commit comments

Comments
 (0)