Skip to content

Commit 9c36151

Browse files
tcya authored and meterstick-copybara committed
Generalize the sql dialect to be configurable. This allows Meterstick to be used with other SQL dialects than GoogleSQL, which is requested in #245. Tested in https://colab.research.google.com/drive/1y3UigzEby1anMM3-vXocBx7V8LVblIAp?usp=sharing
PiperOrigin-RevId: 784820403
1 parent b843fed commit 9c36151

File tree

10 files changed

+736
-173
lines changed

10 files changed

+736
-173
lines changed

README.md

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@ routine data analysis tasks. Please see [meterstick_demo.ipynb](https://colab.re
77

88
This is not an officially supported Google product.
99

10-
1110
## tl;dr
1211

1312
Modify the demo colab [notebook](https://colab.research.google.com/github/google/meterstick/blob/master/meterstick_demo.ipynb) and adapt it to your needs.
@@ -41,7 +40,6 @@ This calculates the percent change in conversion rate and bounce rate,
4140
relative to the control arm, for each country and device, together with
4241
95% confidence intervals based on jackknife standard errors.
4342

44-
4543
## Building Blocks of an Analysis Object
4644

4745
### Metrics
@@ -249,7 +247,6 @@ metrics for non-spam clicks you can add a `where` clause to the Metric or
249247
MetricList. This clause is a boolean expression which can be passed to pandas'
250248
[query() method](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.query.html).
251249

252-
253250
```python
254251
sum_non_spam_clicks = Sum("Clicks", where="~IsSpam")
255252
MetricList([Sum("Clicks"), Sum("Conversions")], where="~IsSpam")
@@ -308,9 +305,7 @@ It can help you to sanity check complex Metrics.
308305

309306
You can get the SQL query for all built-in Metrics and Operations by calling
310307
`to_sql(sql_data_source, split_by)` on the Metric. `sql_data_source` could be a
311-
table or a subquery. The dialect it uses is the
312-
[standard SQL](https://cloud.google.com/bigquery/docs/reference/standard-sql)
313-
in Google Cloud's BigQuery. For example,
308+
table or a subquery. For example,
314309

315310
```python
316311
MetricList((Sum('X', where='Y > 0'), Sum('X'))).to_sql('table', 'grp')
@@ -338,6 +333,19 @@ function that can execute SQL query. The `mode` can be `None` or
338333
`'mixed'`. The former is recommended and computes things in SQL whenever
339334
possible while the latter only computes the leaf Metrics in SQL.
340335

336+
The default dialect it uses is GoogleSQL. You can use `set_dialect()` to choose
337+
other dialects. Currently we support
338+
339+
* PostgreSQL
340+
* MySQL and MariaDB
341+
* SQLite
342+
* Oracle
343+
* Microsoft SQL Server
344+
* Trino SQL
345+
346+
For other dialects, you can manually overwrite the default string templates at
347+
the top of `sql.py` file.
348+
341349
## Apache Beam
342350

343351
There is also a

diversity.py

Lines changed: 48 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -26,12 +26,22 @@
2626
class DiversityBase(operations.Distribution):
2727
"""Base class that captures shared logic of diversity Operations."""
2828

29-
def __init__(self, over, child, name_tmpl, additional_fingerprint_attrs=None):
29+
def __init__(
30+
self,
31+
over,
32+
child,
33+
name_tmpl,
34+
where,
35+
additional_fingerprint_attrs=None,
36+
**kwargs,
37+
):
3038
super(DiversityBase, self).__init__(
3139
over,
3240
child,
3341
name_tmpl,
42+
where=where,
3443
additional_fingerprint_attrs=additional_fingerprint_attrs,
44+
**kwargs
3545
)
3646
self.extra_index = []
3747

@@ -59,8 +69,8 @@ def to_dataframe(self, res):
5969
class HHI(DiversityBase):
6070
"""Herfindahl–Hirschman index of metric distribution."""
6171

62-
def __init__(self, over, child=None):
63-
super(HHI, self).__init__(over, child, 'HHI of {}')
72+
def __init__(self, over, child=None, where=None, **kwargs):
73+
super(HHI, self).__init__(over, child, 'HHI of {}', where, **kwargs)
6474

6575
def compute_on_children(self, child, split_by):
6676
dist = super(HHI, self).compute_on_children(child, split_by)
@@ -100,8 +110,8 @@ def get_sql_and_with_clause(
100110
class Entropy(DiversityBase):
101111
"""Entropy of metric distribution."""
102112

103-
def __init__(self, over, child=None):
104-
super(Entropy, self).__init__(over, child, 'Entropy of {}')
113+
def __init__(self, over, child=None, where=None, **kwargs):
114+
super(Entropy, self).__init__(over, child, 'Entropy of {}', where, **kwargs)
105115

106116
def compute_on_children(self, child, split_by):
107117
dist = super(Entropy, self).compute_on_children(child, split_by)
@@ -141,14 +151,24 @@ def get_sql_and_with_clause(
141151
class TopK(DiversityBase):
142152
"""The total share of the largest k contributors."""
143153

144-
def __init__(self, over, k, child=None, additional_fingerprint_attrs=None):
154+
def __init__(
155+
self,
156+
over,
157+
k,
158+
child=None,
159+
where=None,
160+
additional_fingerprint_attrs=None,
161+
**kwargs,
162+
):
145163
if not isinstance(k, int):
146164
raise ValueError('k must be an integer!')
147165
super(TopK, self).__init__(
148166
over,
149167
child,
150168
"Top-%s's share of {}" % k,
169+
where,
151170
['k'] + (additional_fingerprint_attrs or []),
171+
**kwargs,
152172
)
153173
self.k = k
154174

@@ -173,9 +193,14 @@ def get_sql_and_with_clause(
173193
1. Get the query for the Distribution of the child Metric.
174194
2. Keep all indexing/groupby columns unchanged.
175195
3. For all value columns, collect the top-k values into an array by
176-
ARRAY_AGG(val_col ORDER BY val_col DESC LIMIT k) AS val_arr.
177-
4. For all value columns, do 'SELECT SUM(x) FROM UNNEST(val_arr) AS x' to
178-
get the sum of the top-k values.
196+
ARRAY_AGG(val_col IGNORE NULLS ORDER BY val_col DESC LIMIT k) AS val_arr.
197+
Note that the ordering between number and NULLs varies by dialect so we
198+
use IGNORE NULLS.
199+
4. For all value columns, do
200+
'SELECT SUM(x) FROM UNNEST(val_arr) AS x WITH OFFSET AS i WHERE i < k'
201+
to get the sum of the top-k values. Note that the
202+
'WITH OFFSET AS i WHERE i < k' is redundant here but many external
203+
dialects don't support 'LIMIT k' in #3 so we need to do it in #4.
179204
180205
Args:
181206
table: The table we want to query from.
@@ -206,14 +231,14 @@ def get_sql_and_with_clause(
206231
continue
207232

208233
top_k_array_col = sql.Column(
209-
(c.alias, c.alias),
210-
'ARRAY_AGG({} ORDER BY {} DESC LIMIT %s)' % self.k,
234+
c.alias,
235+
sql.ARRAY_AGG_FN(c.alias, ascending=False, dropna=True, limit=self.k),
211236
)
212237
top_k_array_col.set_alias(c.alias_raw)
213238
top_k_array_columns.add(top_k_array_col)
214239
top_k_sum_col = sql.Column(
215-
top_k_array_col.alias,
216-
'(SELECT SUM(x) FROM UNNEST({}) AS x)',
240+
'(SELECT SUM(x) FROM'
241+
f' {sql.UNNEST_ARRAY_FN(top_k_array_col.alias, "x", "i", self.k)})',
217242
)
218243
top_k_sum_col.set_alias(self.name_tmpl.format(c.alias_raw))
219244
top_k_sum_columns.add(top_k_sum_col)
@@ -229,7 +254,13 @@ class Nxx(DiversityBase):
229254
"""The minimum number of contributors to achieve certain share."""
230255

231256
def __init__(
232-
self, over, share, child=None, additional_fingerprint_attrs=None
257+
self,
258+
over,
259+
share,
260+
child=None,
261+
where=None,
262+
additional_fingerprint_attrs=None,
263+
**kwargs,
233264
):
234265
if not 0 < share <= 1:
235266
raise ValueError('Share must be in (0, 1]!')
@@ -238,7 +269,9 @@ def __init__(
238269
child,
239270
'N(%s) of {}'
240271
% (int(100 * share) if (100 * share).is_integer() else 100 * share),
272+
where,
241273
['share'] + (additional_fingerprint_attrs or []),
274+
**kwargs,
242275
)
243276
self.share = share
244277

@@ -308,8 +341,7 @@ def get_sql_and_with_clause(
308341
cumsum_cols.add(cumsum_col)
309342

310343
nxx_col = sql.Column(
311-
cumsum_col.alias,
312-
'COUNTIF({} < %s) + 1' % self.share,
344+
cumsum_col.alias, sql.COUNTIF_FN('{} < %s' % self.share) + ' + 1'
313345
)
314346
nxx_col.set_alias(self.name_tmpl.format(c.alias_raw))
315347
nxx_cols.add(nxx_col)

meterstick_custom_metrics.ipynb

Lines changed: 16 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -78,18 +78,8 @@
7878
},
7979
{
8080
"cell_type": "code",
81-
"execution_count": 3,
81+
"execution_count": null,
8282
"metadata": {
83-
"executionInfo": {
84-
"elapsed": 52,
85-
"status": "ok",
86-
"timestamp": 1727469298213,
87-
"user": {
88-
"displayName": "",
89-
"userId": ""
90-
},
91-
"user_tz": 420
92-
},
9383
"id": "0UI9rAtZnBUG"
9484
},
9585
"outputs": [],
@@ -2690,17 +2680,17 @@
26902680
"execution_count": null,
26912681
"metadata": {
26922682
"executionInfo": {
2693-
"elapsed": 60,
2683+
"elapsed": 59,
26942684
"status": "ok",
2695-
"timestamp": 1684897961304,
2685+
"timestamp": 1752749803027,
26962686
"user": {
2697-
"displayName": "",
2698-
"userId": ""
2687+
"displayName": "Xunmo Yang",
2688+
"userId": "12474546967758012552"
26992689
},
27002690
"user_tz": 420
27012691
},
2702-
"id": "F9w6gugy3vYf",
2703-
"outputId": "a6853397-c2c2-49c3-a8db-294c15d18b39"
2692+
"id": "5dKXoRHZ_Xi4",
2693+
"outputId": "50aa82e5-e70c-48e3-ee48-cfe6ec7ab880"
27042694
},
27052695
"outputs": [
27062696
{
@@ -2715,11 +2705,10 @@
27152705
"SELECT\n",
27162706
" country,\n",
27172707
" SAFE_DIVIDE(sum_clicks, SUM(sum_clicks) OVER ()) AS Distribution_of_sum_clicks\n",
2718-
"FROM DistributionRaw\n",
2719-
"GROUP BY country, Distribution_of_sum_clicks"
2708+
"FROM DistributionRaw"
27202709
]
27212710
},
2722-
"execution_count": 194,
2711+
"execution_count": 18,
27232712
"metadata": {},
27242713
"output_type": "execute_result"
27252714
}
@@ -2783,12 +2772,12 @@
27832772
" table, indexes, global_filter, indexes, local_filter, with_data\n",
27842773
" )\n",
27852774
" child_table = sql.Datasource(child_sql, 'DistributionRaw')\n",
2786-
" # Always use the alias returned by with_data.add(), because if the with_data\n",
2787-
" # already holds a different table that also has 'DistributionRaw' as its\n",
2788-
" # alias, we'll use a different alias for the child_table, which is returned\n",
2789-
" # by with_data.add().\n",
2790-
" child_table_alias = with_data.add(child_table)\n",
2791-
" groupby = sql.Columns(indexes.aliases, distinct=True)\n",
2775+
" # Always use the alias returned by with_data.merge(), because if the\n",
2776+
" # with_data already holds a different table that also has 'DistributionRaw'\n",
2777+
" # as its alias, we'll use a different alias for the child_table, which is\n",
2778+
" # returned by with_data.merge().\n",
2779+
" child_table_alias = with_data.merge(child_table)\n",
2780+
" groupby = sql.Columns(indexes.aliases)\n",
27922781
" columns = sql.Columns()\n",
27932782
" for c in child_sql.columns:\n",
27942783
" if c.alias in groupby:\n",
@@ -3157,7 +3146,7 @@
31573146
},
31583147
{
31593148
"cell_type": "code",
3160-
"execution_count": 4,
3149+
"execution_count": null,
31613150
"metadata": {
31623151
"executionInfo": {
31633152
"elapsed": 53,

meterstick_demo.ipynb

Lines changed: 17 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -18595,7 +18595,16 @@
1859518595
"\n",
1859618596
"where `execute` is a function that can execute SQL queries. The return is very similar to compute_on().\n",
1859718597
"\n",
18598-
"The dialect it uses is the [standard SQL](https://cloud.google.com/bigquery/docs/reference/standard-sql) in Google Cloud's BigQuery.\n",
18598+
"The default dialect it uses is GoogleSQL. You can use `set_dialect()` to choose other dialects. Currently we support\n",
18599+
"\n",
18600+
"* PostgreSQL\n",
18601+
"* MySQL and MariaDB\n",
18602+
"* SQLite\n",
18603+
"* Oracle\n",
18604+
"* Microsoft SQL Server\n",
18605+
"* Trino SQL\n",
18606+
"\n",
18607+
"For other dialects, you can manually overwrite the default string templates at the top of `sql.py` file.\n",
1859918608
"\n",
1860018609
"The choice of `create_tmp_table_for_volatile_fn` depends on your SQL engine. If query\n",
1860118610
"```\n",
@@ -18615,17 +18624,17 @@
1861518624
{
1861618625
"metadata": {
1861718626
"executionInfo": {
18618-
"elapsed": 59,
18627+
"elapsed": 54,
1861918628
"status": "ok",
18620-
"timestamp": 1750186450282,
18629+
"timestamp": 1752474546796,
1862118630
"user": {
1862218631
"displayName": "Xunmo Yang",
1862318632
"userId": "12474546967758012552"
1862418633
},
1862518634
"user_tz": 420
1862618635
},
18627-
"id": "eoHY1kVlPbSL",
18628-
"outputId": "e4f42347-809c-492d-ec9a-a72103fd86ef"
18636+
"id": "U4Gef-jGn2SY",
18637+
"outputId": "835c7c07-c13f-44a7-886d-9382e204081a"
1862918638
},
1863018639
"cell_type": "code",
1863118640
"source": [
@@ -18637,13 +18646,13 @@
1863718646
"text/plain": [
1863818647
"SELECT\n",
1863918648
" grp,\n",
18640-
" SUM(IF(Y \u003e 0, X, 0)) AS sum_X,\n",
18649+
" SUM(CASE WHEN Y \u003e 0 THEN X ELSE 0 END) AS sum_X,\n",
1864118650
" SUM(X) AS sum_X_1\n",
1864218651
"FROM T\n",
1864318652
"GROUP BY grp"
1864418653
]
1864518654
},
18646-
"execution_count": 54,
18655+
"execution_count": 492,
1864718656
"metadata": {},
1864818657
"output_type": "execute_result"
1864918658
}
@@ -21198,8 +21207,7 @@
2119821207
"file_id": "1u9XmuUlA0TtGmERFV1cSY-4UWpXYIJWL",
2119921208
"timestamp": 1588129678918
2120021209
}
21201-
],
21202-
"toc_visible": true
21210+
]
2120321211
},
2120421212
"kernelspec": {
2120521213
"display_name": "Python 3",

0 commit comments

Comments
 (0)