
Commit 7c462df

Update docs (#77)
* refactored examples for with_sql
* Fixed unit tests
* Updated readme
* Updated display description
1 parent b04defb commit 7c462df

10 files changed (+39, -39 lines)
README.md

Lines changed: 6 additions & 7 deletions
@@ -48,8 +48,8 @@ As an illustration, consider the scenario where you need to retrieve a single ro
 
 ```
 dx.from_tables("dev_*.*.*sample*")\
-.apply_sql("SELECT to_json(struct(*)) AS row FROM {full_table_name} LIMIT 1")\
-.execute()
+.with_sql("SELECT to_json(struct(*)) AS row FROM {full_table_name} LIMIT 1")\
+.apply()
 ```
 
 ## Available functionality
@@ -59,7 +59,7 @@ The available `dx` functions are
 * `from_tables("<catalog>.<schema>.<table>")` selects tables based on the specified pattern (use `*` as a wildcard). Returns a `DataExplorer` object with methods
 * `having_columns` restricts the selection to tables that have the specified columns
 * `with_concurrency` defines how many queries are executed concurrently (10 by default)
-* `apply_sql` applies a SQL template to all tables. After this command you can apply an [action](#from_tables-actions). See in-depth documentation [here](docs/Arbitrary_multi-table_SQL.md).
+* `with_sql` applies a SQL template to all tables. After this command you can apply an [action](#from_tables-actions). See in-depth documentation [here](docs/Arbitrary_multi-table_SQL.md).
 * `unpivot_string_columns` returns a melted (unpivoted) dataframe with all string columns from the selected tables. After this command you can apply an [action](#from_tables-actions)
 * `scan` (experimental) scans the lakehouse with regex expressions defined by the rules to power the semantic classification.
 * `intro` gives an introduction to the library
@@ -72,12 +72,11 @@ The available `dx` functions are
 
 ### from_tables Actions
 
-After a `apply_sql` or `unpivot_string_columns` command, you can apply the following actions:
+After a `with_sql` or `unpivot_string_columns` command, you can apply the following actions:
 
 * `explain` explains the queries that would be executed
-* `execute` executes the queries and shows the result in a unioned dataframe
-* `to_union_dataframe` unions all the dataframes that result from the queries
-
+* `display` executes the queries and shows the first 1000 rows of the result in a unioned dataframe
+* `apply` returns a unioned dataframe with the result from the queries
 
 ## Requirements
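For orientation, here is how the renamed pieces chain together after this change. This is a minimal sketch, not part of the diff; it assumes `dx` has been initialized as the DiscoverX entry point (typically `from discoverx import DX; dx = DX()`) and that the pattern matches existing tables. `explain()` only prints the generated SQL, while `apply()` executes it and returns the unioned dataframe.

```
from discoverx import DX

# Assumed initialization; see the README for setup details
dx = DX()

# Dry run: print the per-table SQL without executing anything
dx.from_tables("dev_*.*.*sample*")\
    .with_sql("SELECT to_json(struct(*)) AS row FROM {full_table_name} LIMIT 1")\
    .explain()

# Execute and keep the unioned result for further processing
result_df = dx.from_tables("dev_*.*.*sample*")\
    .with_sql("SELECT to_json(struct(*)) AS row FROM {full_table_name} LIMIT 1")\
    .apply()
```

`display()` can take the place of `apply()` when only the first 1000 rows need to be inspected.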

discoverx/dx.py

Lines changed: 1 addition & 1 deletion
@@ -68,7 +68,7 @@ def intro(self):
 <p>
 Then you can apply the following operations
 <ul>
-<li><code>.apply_sql(...)</code> - Runs a SQL template on each table</li>
+<li><code>.with_sql(...)</code> - Runs a SQL template on each table</li>
 <li><code>.scan(...)</code> - Scan your lakehouse for columns matching the given rules</li>
 <li><code>.search(...)</code> - Search your lakehouse for columns matching the given search term</li>
 </ul>

discoverx/explorer.py

Lines changed: 1 addition & 1 deletion
@@ -113,7 +113,7 @@ def unpivot_string_columns(self, sample_size=None) -> "DataExplorerActions":
         if sample_size is not None:
             sql_query_template += f"TABLESAMPLE ({sample_size} ROWS)"
 
-        return self.apply_sql(sql_query_template)
+        return self.with_sql(sql_query_template)
 
     def scan(
         self,
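The method touched here only builds the unpivot SQL (appending a `TABLESAMPLE` clause when a sample size is given) and delegates to `with_sql`, so it supports the same downstream actions. A brief usage sketch, assuming the same `dx` entry point as above and an illustrative `dev_*.*.*` pattern:

```
# Melt the string columns of every matched table, sampling 100 rows per table,
# then union the per-table results into a single DataFrame
unpivoted_df = dx.from_tables("dev_*.*.*")\
    .unpivot_string_columns(sample_size=100)\
    .apply()
```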

docs/Arbitrary_multi-table_SQL.md

Lines changed: 7 additions & 7 deletions
@@ -10,8 +10,8 @@ For example, to vacuum all the tables in "default" catalog:
 
 ```
 dx.from_tables("default.*.*")\
-.apply_sql("VACUUM {full_table_name}")\
-.execute()
+.with_sql("VACUUM {full_table_name}")\
+.display()
 ```
 
 That will apply the SQL template `VACUUM {full_table_name}` to all tables matched by the pattern `default.*.*`.
@@ -26,7 +26,7 @@ You can use the `explain()` command to see the SQL that would be executed.
 
 ```
 dx.from_tables("default.*.*")\
-.apply_sql("VACUUM {full_table_name}")\
+.with_sql("VACUUM {full_table_name}")\
 .explain()
 ```
 
@@ -35,14 +35,14 @@ You can also filter tables that have a specific column name.
 ```
 dx.from_tables("default.*.*")\
 .having_columns("device_id")\
-.apply_sql("OPTIMIZE {full_table_name} ZORDER BY (`device_id`)")\
-.execute()
+.with_sql("OPTIMIZE {full_table_name} ZORDER BY (`device_id`)")\
+.display()
 ```
 
 ## Select entire rows as json
 
 ```
 dx.from_tables("default.*.*")\
-.apply_sql("SELECT to_json(struct(*)) AS json_row FROM {full_table_name}")\
-.execute()
+.with_sql("SELECT to_json(struct(*)) AS json_row FROM {full_table_name}")\
+.display()
 ```
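Since `apply()` returns an ordinary unioned dataframe, the template output can also be shaped for reporting. The sketch below is illustrative only; it relies on the `{full_table_name}` placeholder used throughout this page, embedded in a string literal so each result row is tagged with its source table (the `table_name` and `row_count` aliases are just example names):

```
row_counts = dx.from_tables("default.*.*")\
    .with_sql("SELECT '{full_table_name}' AS table_name, COUNT(*) AS row_count FROM {full_table_name}")\
    .apply()

# Largest tables first
row_counts.orderBy("row_count", ascending=False).display()
```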

docs/GDPR_RoA.md

Lines changed: 7 additions & 3 deletions
@@ -9,6 +9,10 @@ For example, if you want to get all data for user `1` from all tables that have
 ```
 df = dx.from_tables("*.*.*")\
 .having_columns("user_id")\
-.apply_sql("SELECT `user_id`, to_json(struct(*)) AS row_content FROM {full_table_name} WHERE `user_id` = 1")\
-.to_union_dataframe()
-```
+.with_sql("SELECT `user_id`, to_json(struct(*)) AS row_content FROM {full_table_name} WHERE `user_id` = 1")\
+.apply()
+```
+
+### Limitations
+
+The current approach only selects tables that contain the specified column, and does not recursively follow the relationships with other tables.
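Once the rows are collected they still have to be handed over to the requester. A minimal follow-up sketch, assuming the `df` produced above; the output path is a hypothetical placeholder rather than anything defined by DiscoverX:

```
# Persist the collected rows so they can be packaged for the data subject.
# "/tmp/user_1_right_of_access" is a placeholder; point it at your delivery location.
df.write.mode("overwrite").json("/tmp/user_1_right_of_access")
```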

docs/GDPR_RoE.md

Lines changed: 3 additions & 3 deletions
@@ -9,9 +9,9 @@ For example, if you want to delete users `1`, `2`, and `3` from all tables that
 ```
 dx.from_tables("*.*.*")\
 .having_columns("user_id")\
-.apply_sql("DELETE FROM {full_table_name} WHERE `user_id` IN (1, 2, 3)")\
-.execute()
-# You can use .explain() instead of .execute() to preview the generated SQL
+.with_sql("DELETE FROM {full_table_name} WHERE `user_id` IN (1, 2, 3)")\
+.display()
+# You can use .explain() instead of .display() to preview the generated SQL
 ```
 
 ## Vacuum

docs/Vacuum.md

Lines changed: 2 additions & 2 deletions
@@ -8,8 +8,8 @@ With DiscoverX you can vacuum all the tables at once with the command:
 
 ```
 dx.from_tables("*.*.*")\
-.apply_sql("VACUUM {full_table_name}")\
-.execute()
+.with_sql("VACUUM {full_table_name}")\
+.display()
 ```
 
 You can schedule [this example notebook](https://raw.githubusercontent.com/databrickslabs/discoverx/master/examples/vacuum_multiple_tables.py) in your Databricks workflows to run vacuum periodically.

examples/detect_small_files.py

Lines changed: 9 additions & 12 deletions
@@ -7,7 +7,7 @@
 # MAGIC As a rule of thumb, if a table has more than `100` files and average file size smaller than `10 MB`, then we can consider it having too many small files.
 # MAGIC
 # MAGIC Some common causes of too many small files are:
-# MAGIC * Overpartitioning: the cardinality of the partition columns is too high
+# MAGIC * Overpartitioning: the cardinality of the partition columns is too high
 # MAGIC * Lack of scheduled maintenance operations like `OPTIMIZE`
 # MAGIC * Missing auto optimize on write
 # MAGIC
@@ -38,16 +38,13 @@
 
 from pyspark.sql.functions import col, lit
 
-dx.from_tables(from_tables)\
-.apply_sql("DESCRIBE DETAIL {full_table_name}")\
-.to_union_dataframe()\
-.withColumn("average_file_size_MB", col("sizeInBytes") / col("numFiles") / 1024 / 1024)\
-.withColumn("has_too_many_small_files",
-(col("average_file_size_MB") < small_file_max_size_MB) &
-(col("numFiles") > min_file_number))\
-.filter("has_too_many_small_files")\
-.display()
+dx.from_tables(from_tables).with_sql("DESCRIBE DETAIL {full_table_name}").apply().withColumn(
+    "average_file_size_MB", col("sizeInBytes") / col("numFiles") / 1024 / 1024
+).withColumn(
+    "has_too_many_small_files",
+    (col("average_file_size_MB") < small_file_max_size_MB) & (col("numFiles") > min_file_number),
+).filter(
+    "has_too_many_small_files"
+).display()
 
 # COMMAND ----------
-
-
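The flagged tables could then be fed back into a maintenance pass. A hedged follow-up sketch, keeping the result of the chain above as a dataframe instead of displaying it; it assumes the matched tables are Delta tables, so that `DESCRIBE DETAIL` exposes the full table name in its `name` column, and that `spark` is the notebook's SparkSession:

```
small_file_tables = (
    dx.from_tables(from_tables)
    .with_sql("DESCRIBE DETAIL {full_table_name}")
    .apply()
    .withColumn("average_file_size_MB", col("sizeInBytes") / col("numFiles") / 1024 / 1024)
    .filter((col("average_file_size_MB") < small_file_max_size_MB) & (col("numFiles") > min_file_number))
)

# Compact each table that was flagged as having too many small files
for row in small_file_tables.select("name").collect():
    spark.sql(f"OPTIMIZE {row['name']}")
```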

examples/pii_detection_presidio.py

Lines changed: 1 addition & 1 deletion
@@ -67,7 +67,7 @@
 unpivoted_df = (
     dx.from_tables(from_tables)
     .unpivot_string_columns(sample_size=sample_size)
-    .to_union_dataframe()
+    .apply()
     .localCheckpoint()  # Checkpointing to reduce the query plan size
 )

examples/vacuum_multiple_tables.py

Lines changed: 2 additions & 2 deletions
@@ -28,7 +28,7 @@
 
 # COMMAND ----------
 
-dx.from_tables(from_tables).apply_sql("VACUUM {full_table_name}").explain()
+dx.from_tables(from_tables).with_sql("VACUUM {full_table_name}").explain()
 
 # COMMAND ----------
 
@@ -37,4 +37,4 @@
 
 # COMMAND ----------
 
-(dx.from_tables(from_tables).apply_sql("VACUUM {full_table_name}").execute())
+(dx.from_tables(from_tables).with_sql("VACUUM {full_table_name}").display())
