Commit 7a68c7a

Use Spark 3 in Data Explorer Sample (#98)

Also:
- automatically generate a PAT token for Databricks
- add a README section about continuous export for Data Explorer
1 parent: c96e95c

7 files changed: +69 additions, -52 deletions

components/azure-databricks/create-databricks.sh

Lines changed: 25 additions & 46 deletions
```diff
@@ -28,61 +28,40 @@ fi
 
 databricks_metainfo=$(az resource show -g $RESOURCE_GROUP --resource-type Microsoft.Databricks/workspaces -n $ADB_WORKSPACE -o json)
 
+# Databricks CLI automatically picks up configuration from $DATABRICKS_HOST and $DATABRICKS_TOKEN.
+export DATABRICKS_HOST=$(jq -r '"https://" + .location + ".azuredatabricks.net"' <<<"$databricks_metainfo")
+
 echo 'creating Key Vault to store Databricks PAT token'
 az keyvault create -g $RESOURCE_GROUP -n $ADB_TOKEN_KEYVAULT -o tsv >>log.txt
 
 echo 'checking PAT token secret presence in Key Vault'
 databricks_token_secret_name="DATABRICKS-TOKEN"
 pat_token_secret=$(az keyvault secret list --vault-name $ADB_TOKEN_KEYVAULT --query "[?ends_with(id, '/$databricks_token_secret_name')].id" -o tsv)
 if [[ -z "$pat_token_secret" ]]; then
-  echo 'PAT token secret not present. Creating dummy entry for user to fill in manually'
-  az keyvault secret set --vault-name $ADB_TOKEN_KEYVAULT -n "$databricks_token_secret_name" --file /dev/null -o tsv >>log.txt
+  echo 'generating PAT token'
+  wsId=$(jq -r .id <<<"$databricks_metainfo")
+
+  # Get a token for the global Databricks application.
+  # The resource name is fixed and never changes.
+  token_response=$(az account get-access-token --resource 2ff814a6-3304-4ab8-85cb-cd0e6f879c1d)
+  token=$(jq .accessToken -r <<< "$token_response")
+
+  # Get a token for the Azure management API
+  token_response=$(az account get-access-token --resource https://management.core.windows.net/)
+  azToken=$(jq .accessToken -r <<< "$token_response")
+
+  api_response=$(curl -sf "$DATABRICKS_HOST/api/2.0/token/create" \
+    -H "Authorization: Bearer $token" \
+    -H "X-Databricks-Azure-SP-Management-Token:$azToken" \
+    -H "X-Databricks-Azure-Workspace-Resource-Id:$wsId" \
+    -d '{ "lifetime_seconds": 864000, "comment": "streaming-at-scale generated token" }')
+  pat_token=$(jq .token_value -r <<< "$api_response")
+
+  az keyvault secret set --vault-name "$ADB_TOKEN_KEYVAULT" --name "$databricks_token_secret_name" --value "$pat_token"
 fi
 
-echo 'checking PAT token presence in Key Vault'
-pat_token=$(az keyvault secret show --vault-name $ADB_TOKEN_KEYVAULT -n "$databricks_token_secret_name" --query value -o tsv)
-
-if [[ -z "$pat_token" ]]; then
-  echo 'PAT token not present. Requesting user to fill in manually'
-  databricks_login_url=$(jq -r '"https://" + .location + ".azuredatabricks.net/aad/auth?has=&Workspace=" + .id + "&WorkspaceResourceGroupUri="+ .properties.managedResourceGroupId' <<<"$databricks_metainfo")
-
-  kv_info=$(az resource show -g $RESOURCE_GROUP --resource-type Microsoft.KeyVault/vaults -n $ADB_TOKEN_KEYVAULT -o json)
-  kv_secrets_url=$(jq -r '"https://portal.azure.com/#@" + .properties.tenantId + "/resource" + .id + "/secrets"' <<<$kv_info)
-
-  cat <<EOM
-ERROR: Missing PAT token in Key Vault (this is normal the first time you run this script).
-
-You need to manually create a Databricks PAT token and register it into the Key Vault as follows,
-then rerun this script or pipeline.
-
-  - Navigate to:
-      $databricks_login_url
-    Create a PAT token and copy it to the clipboard:
-      https://docs.azuredatabricks.net/api/latest/authentication.html#generate-a-token
-  - Navigate to:
-      $kv_secrets_url
-    Click $databricks_token_secret_name
-    Click "+ New Version"
-    As value, enter the PAT token you copied
-    Click Create
-  - The script will wait for the PAT to be copied into the Key Vault
-    If you stop the script, you can resume it running the following command:
-      ./create-solution.sh -d "$PREFIX" -t $TESTTYPE -s PT
-
-EOM
-
-  echo 'waiting for PAT (polling every 5 secs)...'
-  while : ; do
-    pat_token=$(az keyvault secret show --vault-name "$ADB_TOKEN_KEYVAULT" --name "$databricks_token_secret_name" --query value -o tsv | grep dapi || true)
-    if [ ! -z "$pat_token" ]; then break; fi
-    sleep 5
-  done
-  echo 'PAT detected'
-fi
-
-# Databricks CLI automatically picks up configuration from these two environment variables.
-export DATABRICKS_HOST=$(jq -r '"https://" + .location + ".azuredatabricks.net"' <<<"$databricks_metainfo")
-export DATABRICKS_TOKEN="$pat_token"
+echo 'getting PAT token from Key Vault'
+export DATABRICKS_TOKEN=$(az keyvault secret show --vault-name $ADB_TOKEN_KEYVAULT -n "$databricks_token_secret_name" --query value -o tsv)
 
 fi
 echo 'checking Databricks secrets scope exists'
```
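As a quick sanity check on the generated token, the Databricks CLI can be pointed at the stored secret; a minimal sketch (any read-only API call would do, `clusters list` is just a cheap one):

```bash
# The Databricks CLI reads DATABRICKS_HOST and DATABRICKS_TOKEN from the
# environment, so a read-only call confirms the generated PAT works.
export DATABRICKS_TOKEN=$(az keyvault secret show \
    --vault-name "$ADB_TOKEN_KEYVAULT" \
    --name "DATABRICKS-TOKEN" \
    --query value -o tsv)
databricks clusters list
```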

eventhubs-dataexplorer/README.md

Lines changed: 33 additions & 0 deletions
````diff
@@ -145,3 +145,36 @@ To remove all the created resource, you can just delete the related resource gro
 ```bash
 az group delete -n <resource-group-name>
 ```
+
+## Next steps
+
+Retaining long-term data in Azure Data Explorer can drive up costs. You can set up [continuous data export](https://docs.microsoft.com/en-us/azure/data-explorer/kusto/management/data-export/continuous-data-export) to save derivations from ingested data into storage. In conjunction with a [retention policy](https://docs.microsoft.com/en-us/azure/data-explorer/kusto/management/retentionpolicy), this allows data tiering, serving hot data from Data Explorer's own storage, and colder data through the external table.
+
+The sample statements below use CSV files in storage blob for simplicity. Use Parquet instead to improve file size and access performance, especially if planning to query data from the external table. Use Azure Data Lake Storage Gen2 instead of blob for improved performance and to avoid the need for hard-coded credentials.
+
+
+```kql
+.create external table SummarizedEvents (deviceId: string, type: string, count:long, from:datetime, to:datetime)
+kind=blob
+dataformat=csv
+(
+   h@'https://<SOLUTION_NAME>storage.blob.core.windows.net/export;<STORAGE_KEY>'
+)
+
+.create function
+EventSummary()
+{
+EventTable
+| summarize count=count(), from=min(createdAt), to=max(createdAt) by deviceId, type
+}
+
+// Create the target table (if it doesn't already exist)
+.set-or-append SummarizedEvents <| EventSummary() | limit 0
+
+.create-or-alter continuous-export SummarizedEventsExport
+to table SummarizedEvents
+with
+(intervalBetweenRuns=5m)
+<| EventSummary()
+
+```
````
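The README's note about Parquet and Data Lake Storage Gen2 maps to a small change in the external table definition; a hedged sketch (the `abfss` path and account name are illustrative, not part of the sample):

```kql
// Sketch only: the same external table as Parquet on ADLS Gen2, using
// impersonation so no storage key is embedded in the definition.
.create external table SummarizedEvents (deviceId: string, type: string, count: long, from: datetime, to: datetime)
kind=adl
dataformat=parquet
(
    h@'abfss://export@<SOLUTION_NAME>datalake.dfs.core.windows.net/;impersonate'
)
```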

eventhubs-dataexplorer/create-solution.sh

Lines changed: 6 additions & 3 deletions
```diff
@@ -47,11 +47,13 @@ if [[ -z "$PREFIX" ]]; then
     usage
 fi
 
+export DATABRICKS_SPARKVERSION=7.3.x-scala2.12
+
 # 10000 messages/sec
 if [ "$TESTTYPE" == "10" ]; then
     export EVENTHUB_PARTITIONS=12
     export EVENTHUB_CAPACITY=12
-    export DATAEXPLORER_SKU=D13_v2
+    export DATAEXPLORER_SKU=Standard_D13_v2
     export DATAEXPLORER_CAPACITY=3
     export SIMULATOR_INSTANCES=5
 fi
@@ -60,7 +62,7 @@ fi
 if [ "$TESTTYPE" == "5" ]; then
     export EVENTHUB_PARTITIONS=8
     export EVENTHUB_CAPACITY=6
-    export DATAEXPLORER_SKU=D12_v2
+    export DATAEXPLORER_SKU=Standard_D12_v2
     export DATAEXPLORER_CAPACITY=2
     export SIMULATOR_INSTANCES=3
 fi
@@ -69,7 +71,7 @@ fi
 if [ "$TESTTYPE" == "1" ]; then
     export EVENTHUB_PARTITIONS=2
     export EVENTHUB_CAPACITY=2
-    export DATAEXPLORER_SKU=D11_v2
+    export DATAEXPLORER_SKU=Standard_D11_v2
     export DATAEXPLORER_CAPACITY=2
     export SIMULATOR_INSTANCES=1
 fi
@@ -169,6 +171,7 @@ echo "***** [V] Starting deployment VERIFICATION"
 
 RUN=`echo $STEPS | grep V -o || true`
 if [ ! -z "$RUN" ]; then
+    source ../assert/has-local-databrickscli.sh
     source ../components/azure-databricks/create-databricks.sh
     source ../streaming/databricks/runners/verify-dataexplorer.sh
 fi
```
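The new assert guards the verification stage against a missing local CLI. The repository's actual `has-local-databrickscli.sh` is not shown in this diff; a hypothetical sketch of what such a check typically looks like:

```bash
# Hypothetical: fail fast if the databricks CLI is not on the PATH.
if ! command -v databricks >/dev/null 2>&1; then
    echo "databricks CLI not found"
    echo "install it with: pip install databricks-cli"
    exit 1
fi
```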

streaming/databricks/job/run-databricks-job.sh

Lines changed: 1 addition & 0 deletions
```diff
@@ -39,6 +39,7 @@ wait_for_run () {
 cluster_jq_command="$(cat <<JQ
 .name = "Streaming at scale job $notebook_name"
 | .notebook_task.notebook_path = "/Shared/streaming-at-scale/$notebook_name"
+| .new_cluster.spark_version = "$DATABRICKS_SPARKVERSION"
 | .new_cluster.node_type_id = "$DATABRICKS_NODETYPE"
 | .new_cluster.num_workers = $DATABRICKS_WORKERS
 | .timeout_seconds = $((${REPORT_THROUGHPUT_MINUTES:-30} * 60))
```
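For illustration, with the defaults in `verify-common.sh` and the Data Explorer sample's Spark version, the jq filter above yields job settings along these lines (values are examples, not captured output):

```json
{
  "name": "Streaming at scale job verify-dataexplorer",
  "notebook_task": {
    "notebook_path": "/Shared/streaming-at-scale/verify-dataexplorer"
  },
  "new_cluster": {
    "spark_version": "7.3.x-scala2.12",
    "node_type_id": "Standard_F4s",
    "num_workers": 2
  },
  "timeout_seconds": 1800
}
```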

streaming/databricks/notebooks/verify-dataexplorer.scala

Lines changed: 2 additions & 2 deletions
```diff
@@ -22,8 +22,8 @@ val cluster = dbutils.widgets.get("dataexplorer-cluster")
 val database = dbutils.widgets.get("dataexplorer-database")
 val query = dbutils.widgets.get("dataexplorer-query")
 val conf: Map[String, String] = Map(
-  KustoSourceOptions.KUSTO_AAD_CLIENT_ID -> dbutils.widgets.get("dataexplorer-client-id"),
-  KustoSourceOptions.KUSTO_AAD_CLIENT_PASSWORD -> dbutils.secrets.get(scope = "MAIN", key = "dataexplorer-client-password"),
+  KustoSourceOptions.KUSTO_AAD_APP_ID -> dbutils.widgets.get("dataexplorer-client-id"),
+  KustoSourceOptions.KUSTO_AAD_APP_SECRET -> dbutils.secrets.get(scope = "MAIN", key = "dataexplorer-client-password"),
   KustoSourceOptions.KUSTO_BLOB_STORAGE_ACCOUNT_NAME -> dbutils.widgets.get("dataexplorer-storage-account"),
   KustoSourceOptions.KUSTO_BLOB_STORAGE_ACCOUNT_KEY -> dbutils.secrets.get(scope = "MAIN", key = "dataexplorer-storage-key"),
   KustoSourceOptions.KUSTO_BLOB_CONTAINER -> dbutils.widgets.get("dataexplorer-storage-container")
```
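The renamed options come from the Spark 3 line of the Kusto connector. For context, a minimal sketch of how a `conf` map like this is typically consumed by the connector (the read itself is outside this hunk):

```scala
import com.microsoft.kusto.spark.datasource.KustoSourceOptions

// Batch read through the Kusto source, merging the per-call options
// with the shared conf map built above.
val df = spark.read
  .format("com.microsoft.kusto.spark.datasource")
  .option(KustoSourceOptions.KUSTO_CLUSTER, cluster)
  .option(KustoSourceOptions.KUSTO_DATABASE, database)
  .option(KustoSourceOptions.KUSTO_QUERY, query)
  .options(conf)
  .load()
```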

streaming/databricks/runners/verify-common.sh

Lines changed: 1 addition & 0 deletions
```diff
@@ -1,5 +1,6 @@
 export DATABRICKS_NODETYPE=Standard_F4s
 export DATABRICKS_WORKERS=2
+export DATABRICKS_SPARKVERSION=${DATABRICKS_SPARKVERSION:-5.5.x-scala2.11}
 export DATABRICKS_MAXEVENTSPERTRIGGER=10000
 
 export DATABRICKS_TESTOUTPUTPATH=dbfs:/test-output/$(uuidgen)
```
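The `${VAR:-default}` expansion is what lets the Data Explorer sample pin Spark 3 while the other samples keep the Spark 2 runtime; a minimal sketch of the behavior:

```bash
# A value exported by the calling script wins over the runner's default.
export DATABRICKS_SPARKVERSION=7.3.x-scala2.12
echo "${DATABRICKS_SPARKVERSION:-5.5.x-scala2.11}"   # -> 7.3.x-scala2.12

unset DATABRICKS_SPARKVERSION
echo "${DATABRICKS_SPARKVERSION:-5.5.x-scala2.11}"   # -> 5.5.x-scala2.11
```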

streaming/databricks/runners/verify-dataexplorer.sh

Lines changed: 1 addition & 1 deletion
```diff
@@ -20,7 +20,7 @@ databricks secrets put --scope "MAIN" --key "dataexplorer-client-password" --str
 databricks secrets put --scope "MAIN" --key "dataexplorer-storage-key" --string-value "$AZURE_STORAGE_KEY"
 
 source ../streaming/databricks/job/run-databricks-job.sh verify-dataexplorer true "$(cat <<JQ
-.libraries += [ { "maven": { "coordinates": "com.microsoft.azure.kusto:spark-kusto-connector:1.0.0-BETA-04", "exclusions": ["javax.mail:mail"] } } ]
+.libraries += [ { "maven": { "coordinates": "com.microsoft.azure.kusto:kusto-spark_3.0_2.12:2.3.0" } } ]
 | .notebook_task.base_parameters."test-output-path" = "$DATABRICKS_TESTOUTPUTPATH"
 | .notebook_task.base_parameters."dataexplorer-cluster" = "$kustoURL"
 | .notebook_task.base_parameters."dataexplorer-database" = "$DATAEXPLORER_DATABASE"
```
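The new Maven coordinate follows the connector's Spark 3 naming scheme: the artifact id `kusto-spark_3.0_2.12` encodes the Spark (3.0) and Scala (2.12) versions it targets, matching the `7.3.x-scala2.12` runtime pinned in `create-solution.sh`. The `javax.mail:mail` exclusion is dropped along with the old artifact, presumably because the new build no longer pulls it in.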
