computational-metabolomics
diff --git a/beams/annotation.py
Lines changed: 6 additions & 4 deletions b/beams/annotation.py
Lines changed: 6 additions & 4 deletions
diff --git a/beams/in_out.py
Lines changed: 18 additions & 3 deletions b/beams/in_out.py
Lines changed: 18 additions & 3 deletions
diff --git a/tests/test_annotation.py
Lines changed: 23 additions & 8 deletions b/tests/test_annotation.py
Lines changed: 23 additions & 8 deletions
diff --git a/tests/test_data/results_annotation.sqlite
0 Bytes b/tests/test_data/results_annotation.sqlite
0 Bytes
@@ -683,6 +683,8 @@ def annotate_compounds(peaklist, lib_adducts, ppm, db_out, db_name, db_in=""):
                     cursor_local.executescript(db_dump.read().decode('utf-8'))
                     conn_local.commit()
 
+                    cursor_local.execute("CREATE INDEX idx_exact_mass ON {} (exact_mass)".format(db_name.replace(".sql.gz", "")))
+
                     cursor_local.execute("SELECT name FROM sqlite_master WHERE type='table'")
                     if (db_name.replace(".sql.gz", ""), ) not in cursor_local.fetchall():
                         raise ValueError("Database {} not available".format(db_name))
@@ -882,7 +884,7 @@ def annotate_drug_products(peaklist, db_out, list_smiles, lib_adducts, ppm, phas
             comp = pyteomics_mass.Composition(mf)
             record.update(comp)
             record["molecular_formula"] = composition_to_string(comp)
-            record["exact_mass"] = round(pyteomics_mass.calculate_mass(formula=str(mf), mass_data=nist_db), 6)
+            record["exact_mass"] = round(pyteomics_mass.calculate_mass(formula=str(mf), mass_data=nist_database), 6)
             record["CHNOPS"] = sum([comp[e] for e in comp if e in ["C", "H", "N", "O", "P", "S"]]) == sum(list(comp.values()))
             records.append(record)
 
@@ -919,7 +921,7 @@ def summary(df, db, single_row=False, single_column=False, convert_rt=None, ndig
     cursor = conn.cursor()
 
     cursor.execute("DROP TABLE IF EXISTS peaklist")
-    df[['name', 'mz', 'rt', "intensity"]].sort_values(by=["rt", "mz"]).to_sql('peaklist', conn, index=False)
+    df[["name", "mz", "rt", "intensity"]].to_sql("peaklist", conn, index=False)
 
     cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
     tables = cursor.fetchall()
@@ -1155,7 +1157,7 @@ def summary(df, db, single_row=False, single_column=False, convert_rt=None, ndig
         join_peak_labels = ""
 
     query = """CREATE TABLE summary AS SELECT
-               peaklist.name, peaklist.mz, peaklist.rt{}{}
+               peaklist.name, peaklist.mz, peaklist.rt, peaklist.intensity{}{}
                FROM peaklist
                {}
                {}
@@ -1212,7 +1214,7 @@ def summary(df, db, single_row=False, single_column=False, convert_rt=None, ndig
                              group_concat(round(ppm_error, 2), '||') as ppm_error
                              """)
 
-        query = """SELECT DISTINCT name, mz, rt, {}
+        query = """SELECT DISTINCT name, mz, rt, intensity, {}
                    from summary
                    GROUP BY NAME
                    ORDER BY rowid
 
@@ -154,7 +154,9 @@ def read_xset_matrix(fn_matrix, first_sample, separator="\t", mapping={"mz": "mz
     return pd.concat([df_peaklist, df_matrix], axis=1)
 
 
-def combine_peaklist_matrix(fn_peaklist, fn_matrix, separator="\t", mapping={"name": "name", "mz": "mz", "rt": "rt"}, merge_on="name", samples_in_columns=True):
+def combine_peaklist_matrix(fn_peaklist, fn_matrix, separator="\t", median_intensity=True,
+                            mapping={"name": "name", "mz": "mz", "rt": "rt", "intensity": "intensity"},
+                            merge_on="name", samples_in_columns=True):
     if "mz" not in mapping and "rt" not in mapping and "name" not in mapping:
         raise ValueError("Incorrect column mapping: provide column names for mz, and name")
 
@@ -176,7 +178,12 @@ def combine_peaklist_matrix(fn_peaklist, fn_matrix, separator="\t", mapping={"na
         df_peaklist.columns = ["name", "mz", "rt"]
 
         df_matrix = df_matrix.rename(columns={mapping["name"]: 'name'})
-        df_peaklist["intensity"] = pd.Series(df_matrix.median(axis=1, skipna=True), index=df_matrix.index)
+
+    if mapping["intensity"] not in df_peaklist.columns:
+        if median_intensity:
+            df_peaklist["intensity"] = pd.Series(df_matrix.median(axis=1, skipna=True), index=df_matrix.index)
+        else:
+            df_peaklist["intensity"] = pd.Series(df_matrix.mean(axis=1, skipna=True), index=df_matrix.index)
 
     if len(df_peaklist[mapping["name"]].unique()) != len(df_peaklist[mapping["name"]]):
         raise ValueError("Peaklist: Values column '{}' are not unique".format(mapping["name"]))
@@ -187,7 +194,8 @@ def combine_peaklist_matrix(fn_peaklist, fn_matrix, separator="\t", mapping={"na
 
 
 
-def read_peaklist(fn_peaklist, separator="\t", mapping={"name": "name", "mz": "mz", "rt": "rt", "intensity": "intensity"}):
+def read_peaklist(fn_peaklist, separator="\t",
+                  mapping={"name": "name", "mz": "mz", "rt": "rt", "intensity": "intensity"}):
 
     df_peaklist = pd.read_csv(fn_peaklist, header=0, sep=separator, dtype={"name": str}, float_precision="round_trip")
     if mapping["mz"] not in df_peaklist.columns.values or mapping["intensity"] not in df_peaklist.columns.values:
@@ -200,9 +208,12 @@ def read_peaklist(fn_peaklist, separator="\t", mapping={"name": "name", "mz": "m
             df_peaklist.columns = ["mz", "intensity"]
             df_peaklist.insert(0, "name", [str(x).replace(".","_") for x in df_peaklist[mapping["mz"]]])
             df_peaklist["mz"] = df_peaklist["mz"].astype(float)
+            df_peaklist["intensity"] = df_peaklist["intensity"].astype(float)
         else:
             df_peaklist = df_peaklist[[mapping["name"], mapping["mz"], mapping["intensity"]]]
             df_peaklist.columns = ["name", "mz", "intensity"]
+            df_peaklist["mz"] = df_peaklist["mz"].astype(float)
+            df_peaklist["intensity"] = df_peaklist["intensity"].astype(float)
         df_peaklist.insert(2, "rt", 0.0)
     elif "rt" in mapping:
         if mapping["name"] in df_peaklist.columns.values:
@@ -223,4 +234,8 @@ def read_peaklist(fn_peaklist, separator="\t", mapping={"name": "name", "mz": "m
         df_peaklist = df_peaklist[[mapping["name"], mapping["mz"], mapping["rt"], mapping["intensity"]]]
         df_peaklist.columns = ["name", "mz", "rt", "intensity"]
 
+    df_peaklist["mz"] = df_peaklist["mz"].astype(float)
+    df_peaklist["rt"] = df_peaklist["rt"].astype(float)
+    df_peaklist["intensity"] = df_peaklist["intensity"].astype(float)
+
     return df_peaklist
@@ -116,15 +116,30 @@ def test_annotate_molecular_formulae(self):
         self.assertEqual(sqlite_count(to_test_results(self.db_results), "molecular_formulae"), 16)
 
     def test_summary(self):
+        """Check summary() output for all three row/column layouts against the reference data."""
+        def _assert(summary_test_data, summary_result):
+            # Compare the written summary file and the 'summary' table with the reference copies.
+            with open(summary_result) as result, open(summary_test_data) as test_data:
+                lines_results = result.read().splitlines()
+                lines_test_data = test_data.read().splitlines()
+            # assertTrue(a, b) only checks that `a` is truthy (b is the message); compare for real.
+            self.assertEqual(lines_results, lines_test_data)
+            self.assertEqual(sqlite_records(to_test_results(self.db_results), "summary"), sqlite_records(to_test_data(self.db_results), "summary"))
+
+        fn_summary = "summary_mr_mc.txt"
         df_summary = summary(self.df, to_test_results(self.db_results), single_row=False, single_column=False, convert_rt=None, ndigits_mz=None)
-        df_summary.to_csv(to_test_results("summary.txt"), sep="\t", index=False)
-        with open(to_test_results("summary.txt")) as result:
-            with open(to_test_data("summary.txt")) as test_data:
-                lines_results = result.read().splitlines()
-                lines_test_data = test_data.read().splitlines()
-                for i in range(len(lines_results)):
-                    self.assertTrue(lines_results[i], lines_test_data[i])
-                    self.assertEqual(sqlite_records(to_test_results(self.db_results), "summary"), sqlite_records(to_test_data(self.db_results), "summary"))
+        df_summary.to_csv(to_test_results(fn_summary), sep="\t", index=False)
+        _assert(to_test_data(fn_summary), to_test_results(fn_summary))
+
+        fn_summary = "summary_sr_mc.txt"
+        df_summary = summary(self.df, to_test_results(self.db_results), single_row=True, single_column=False, convert_rt=None, ndigits_mz=None)
+        df_summary.to_csv(to_test_results(fn_summary), sep="\t", index=False)
+        _assert(to_test_data(fn_summary), to_test_results(fn_summary))
+
+        fn_summary = "summary_sr_sc.txt"
+        df_summary = summary(self.df, to_test_results(self.db_results), single_row=True, single_column=True, convert_rt=None, ndigits_mz=None)
+        df_summary.to_csv(to_test_results(fn_summary), sep="\t", index=False)
+        _assert(to_test_data(fn_summary), to_test_results(fn_summary))
 
 
 if __name__ == '__main__':