Add info-matches CLI command

jeromekelleher · jeromekelleher · commit 45548cd70637 · 2024-08-27T15:44:44.000+01:00
diff --git a/sc2ts/cli.py b/sc2ts/cli.py
@@ -1,4 +1,5 @@
 import json
+import collections
 import concurrent
 import logging
 import platform
@@ -145,6 +146,28 @@ def info_metadata(metadata, verbose, log_file):
         print(metadata_db)
 
 
+@click.command()
+@click.argument("match_db", type=click.Path(exists=True, dir_okay=False))
+@click.option("-v", "--verbose", count=True)
+@click.option("-l", "--log-file", default=None, type=click.Path(dir_okay=False))
+def info_matches(match_db, verbose, log_file):
+    """
+    Information about a match DB: summary, last date and HMM cost histogram
+    """
+    setup_logging(verbose, log_file)
+    with sc2ts.MatchDb(match_db) as db:
+        print(db)
+        print("last date = ", db.last_date())
+        print("cost\tpercent\tcount")
+        df = db.as_dataframe()
+        # An empty DB may give a frame without an hmm_cost column: tabulate nothing.
+        costs = df["hmm_cost"].astype(int) if len(df) > 0 else []
+        hmm_cost_counter = collections.Counter(costs)
+        total = len(df)  # denominators come from the same rows that were counted
+        for cost, count in sorted(hmm_cost_counter.items()):
+            print(f"{cost}\t{count / total * 100:.1f}\t{count}")
+
+
 def add_provenance(ts, output_file):
     # Record provenance here because this is where the arguments are provided.
     provenance = get_provenance_dict()
@@ -389,6 +412,7 @@ def cli():
 cli.add_command(import_metadata)
 cli.add_command(info_alignments)
 cli.add_command(info_metadata)
+cli.add_command(info_matches)
 
 cli.add_command(daily_extend)
 cli.add_command(validate)
diff --git a/sc2ts/inference.py b/sc2ts/inference.py
@@ -17,6 +17,7 @@
 import scipy.cluster.hierarchy
 import zarr
 import numba
+import pandas as pd
 
 from . import core
 from . import alignments
@@ -38,8 +39,21 @@ def __len__(self):
             row = self.conn.execute(sql).fetchone()
             return row["COUNT(*)"]
 
+    def as_dataframe(self):
+        """Return a DataFrame of all samples; columns are present even if empty."""
+        sql = "SELECT strain, match_date, hmm_cost FROM samples"
+        with self.conn:
+            rows = self.conn.execute(sql).fetchall()
+        return pd.DataFrame(rows, columns=["strain", "match_date", "hmm_cost"])
+
+    def last_date(self):
+        """Return the maximum match_date stored in the samples table."""
+        with self.conn:
+            result = self.conn.execute("SELECT MAX(match_date) FROM samples").fetchone()
+        return result["MAX(match_date)"]
+
     def __str__(self):
-        return "MatchDb at {self.uri} has {len(self)} samples"
+        return f"MatchDb at {self.uri} has {len(self)} samples"  # len() queries the DB
 
     def __enter__(self):
         return self