
Commit 92be78c

Update notebook metadata and sample data manipulation
1 parent 1f51897 commit 92be78c

1 file changed: 1 addition & 173 deletions
@@ -1,173 +1 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "3b73b213-58af-4209-9efd-ac34c9e1e1d7",
-   "metadata": {
-    "jupyter": {
-     "outputs_hidden": false,
-     "source_hidden": false
-    },
-    "microsoft": {
-     "language": "python",
-     "language_group": "synapse_pyspark"
-    },
-    "nteract": {
-     "transient": {
-      "deleting": false
-     }
-    }
-   },
-   "outputs": [],
-   "source": [
-    "# IMPORTANT: This notebook manipulates sample data to guarantee that the Power BI report includes data for the current date, the last two days, and the last seven days. \n",
-    "# It is OPTIONAL and is only used to ensure the Power BI report can display data during each deployment."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "e8e036de-0d34-4ea5-ab75-b624ddc2e220",
-   "metadata": {
-    "collapsed": false,
-    "jupyter": {
-     "outputs_hidden": false,
-     "source_hidden": false
-    },
-    "microsoft": {
-     "language": "python",
-     "language_group": "synapse_pyspark"
-    },
-    "nteract": {
-     "transient": {
-      "deleting": false
-     }
-    }
-   },
-   "outputs": [],
-   "source": [
-    "%%sql\n",
-    "--# RUN TO MOVE THE DATES FORWARD TO TODAY\n",
-    "UPDATE ckm_conv_processed\n",
-    "SET StartTime = DATEADD(day, (SELECT DATEDIFF(CURRENT_DATE, MAX(ConversationDate)) FROM ckm_conv_processed), StartTime),\n",
-    "    EndTime = DATEADD(day, (SELECT DATEDIFF(CURRENT_DATE, MAX(ConversationDate)) FROM ckm_conv_processed), EndTime),\n",
-    "    ConversationDate = DATEADD(day, (SELECT DATEDIFF(CURRENT_DATE, MAX(ConversationDate)) FROM ckm_conv_processed), ConversationDate)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "82c35c12-b919-4e55-959a-2300f0412ee0",
-   "metadata": {
-    "jupyter": {
-     "outputs_hidden": false,
-     "source_hidden": false
-    },
-    "microsoft": {
-     "language": "python",
-     "language_group": "synapse_pyspark"
-    },
-    "nteract": {
-     "transient": {
-      "deleting": false
-     }
-    }
-   },
-   "outputs": [],
-   "source": [
-    "# This code manipulates sample data that allocates a percentage of the data\n",
-    "# across a two weeks period to support storytelling and demo\n",
-    "\n",
-    "import pandas as pd\n",
-    "from datetime import date, datetime, timedelta\n",
-    "from pyspark.sql.functions import col\n",
-    "\n",
-    "df = spark.sql(\"SELECT * FROM ckm_conv_processed\")\n",
-    "\n",
-    "# Convert string columns to timestamp types\n",
-    "df = df.withColumn(\"StartTime\", col(\"StartTime\").cast(\"timestamp\"))\n",
-    "df = df.withColumn(\"EndTime\", col(\"EndTime\").cast(\"timestamp\"))\n",
-    "df = df.withColumn(\"ConversationDate\", col(\"ConversationDate\").cast(\"timestamp\"))\n",
-    "\n",
-    "dfp = df.toPandas()\n",
-    "dfp = dfp.sample(frac=1) # Randomly shuffle the df\n",
-    "\n",
-    "# Following list are date weights from Today-0 to Today-13 (two weeks)\n",
-    "weights = [30, 26, 5, 5, 5, 5, 15, 2, 2, 1, 1, 1, 1, 1]\n",
-    "dfindex = 0 # index loop through all conversations\n",
-    "daysback = 0 # start at today and work backwards\n",
-    "\n",
-    "# Create a default time (e.g., noon) to use when NaT is encountered\n",
-    "default_time = datetime.strptime('12:00:00', '%H:%M:%S').time()\n",
-    "\n",
-    "for row in weights:\n",
-    "    numconvos = int((row/100.00) * df.count())\n",
-    "    for i in range(numconvos):\n",
-    "        # Handle NaT values by using default time when necessary\n",
-    "        start_time = dfp.at[dfindex, 'StartTime'].time() if pd.notna(dfp.at[dfindex, 'StartTime']) else default_time\n",
-    "        end_time = dfp.at[dfindex, 'EndTime'].time() if pd.notna(dfp.at[dfindex, 'EndTime']) else default_time\n",
-    "        conv_time = dfp.at[dfindex, 'ConversationDate'].time() if pd.notna(dfp.at[dfindex, 'ConversationDate']) else default_time\n",
-    "        \n",
-    "        # Combine dates with times\n",
-    "        dfp.at[dfindex, 'StartTime'] = datetime.combine(date.today() - timedelta(days=daysback), start_time)\n",
-    "        dfp.at[dfindex, 'EndTime'] = datetime.combine(date.today() - timedelta(days=daysback), end_time)\n",
-    "        dfp.at[dfindex, 'ConversationDate'] = datetime.combine(date.today() - timedelta(days=daysback), conv_time)\n",
-    "        \n",
-    "        dfindex += 1\n",
-    "    daysback += 1\n",
-    "\n",
-    "# Convert back to Spark DataFrame and save\n",
-    "df = spark.createDataFrame(dfp)\n",
-    "df.write.format('delta').mode('overwrite').option(\"overwriteSchema\", \"true\").saveAsTable('ckm_conv_processed_temp')\n",
-    "df = spark.sql(\"SELECT * FROM ckm_conv_processed_temp\")\n",
-    "df.write.format('delta').mode('overwrite').option(\"overwriteSchema\", \"false\").saveAsTable('ckm_conv_processed')"
-   ]
-  }
- ],
- "metadata": {
-  "dependencies": {
-   "lakehouse": {
-    "default_lakehouse": "e6ad9dad-e3da-4da5-bca6-6572c466b69a",
-    "default_lakehouse_name": "ckm_lakehouse",
-    "default_lakehouse_workspace_id": "0d98d480-171b-4b4d-a8e7-80fbd031d1a6",
-    "known_lakehouses": [
-     {
-      "id": "e6ad9dad-e3da-4da5-bca6-6572c466b69a"
-     }
-    ]
-   }
-  },
-  "kernel_info": {
-   "name": "synapse_pyspark"
-  },
-  "kernelspec": {
-   "display_name": "Synapse PySpark",
-   "language": "Python",
-   "name": "synapse_pyspark"
-  },
-  "language_info": {
-   "name": "python"
-  },
-  "microsoft": {
-   "language": "python",
-   "language_group": "synapse_pyspark",
-   "ms_spell_check": {
-    "ms_spell_check_language": "en"
-   }
-  },
-  "nteract": {
-   "version": "[email protected]"
-  },
-  "spark_compute": {
-   "compute_id": "/trident/default"
-  },
-  "synapse_widget": {
-   "state": {},
-   "version": "0.1"
-  },
-  "widgets": {}
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
+{"cells":[{"cell_type":"code","execution_count":null,"id":"3b73b213-58af-4209-9efd-ac34c9e1e1d7","metadata":{"jupyter":{"outputs_hidden":false,"source_hidden":false},"microsoft":{"language":"python","language_group":"synapse_pyspark"},"nteract":{"transient":{"deleting":false}}},"outputs":[],"source":["# IMPORTANT: This notebook manipulates sample data to guarantee that the Power BI report includes data for the current date, the last two days, and the last seven days. \n","# It is OPTIONAL and is only used to ensure the Power BI report can display data during each deployment."]},{"cell_type":"code","execution_count":null,"id":"e8e036de-0d34-4ea5-ab75-b624ddc2e220","metadata":{"collapsed":false,"jupyter":{"outputs_hidden":false,"source_hidden":false},"microsoft":{"language":"python","language_group":"synapse_pyspark"},"nteract":{"transient":{"deleting":false}}},"outputs":[],"source":["%%sql\n","--# RUN TO MOVE THE DATES FORWARD TO TODAY\n","UPDATE ckm_conv_processed\n","SET StartTime = DATEADD(day, (SELECT DATEDIFF(CURRENT_DATE, MAX(ConversationDate)) FROM ckm_conv_processed), StartTime),\n","    EndTime = DATEADD(day, (SELECT DATEDIFF(CURRENT_DATE, MAX(ConversationDate)) FROM ckm_conv_processed), EndTime),\n","    ConversationDate = DATEADD(day, (SELECT DATEDIFF(CURRENT_DATE, MAX(ConversationDate)) FROM ckm_conv_processed), ConversationDate)"]},{"cell_type":"code","execution_count":null,"id":"82c35c12-b919-4e55-959a-2300f0412ee0","metadata":{"jupyter":{"outputs_hidden":false,"source_hidden":false},"microsoft":{"language":"python","language_group":"synapse_pyspark"},"nteract":{"transient":{"deleting":false}}},"outputs":[],"source":["# This code manipulates sample data that allocates a percentage of the data\n","# across a two weeks period to support storytelling and demo\n","\n","import pandas as pd\n","from datetime import date, datetime, timedelta\n","from pyspark.sql.functions import col\n","\n","df = spark.sql(\"SELECT * FROM ckm_conv_processed\")\n","\n","# Convert string columns to timestamp types\n","df = df.withColumn(\"StartTime\", col(\"StartTime\").cast(\"timestamp\"))\n","df = df.withColumn(\"EndTime\", col(\"EndTime\").cast(\"timestamp\"))\n","df = df.withColumn(\"ConversationDate\", col(\"ConversationDate\").cast(\"timestamp\"))\n","\n","dfp = df.toPandas()\n","dfp = dfp.sample(frac=1) # Randomly shuffle the df\n","\n","# Following list are date weights from Today-0 to Today-13 (two weeks)\n","weights = [30, 26, 5, 5, 5, 5, 15, 2, 2, 1, 1, 1, 1, 1]\n","dfindex = 0 # index loop through all conversations\n","daysback = 0 # start at today and work backwards\n","\n","# Create a default time (e.g., noon) to use when NaT is encountered\n","default_time = datetime.strptime('12:00:00', '%H:%M:%S').time()\n","\n","for row in weights:\n","    numconvos = int((row/100.00) * df.count())\n","    for i in range(numconvos):\n","        # Handle NaT values by using default time when necessary\n","        start_time = dfp.at[dfindex, 'StartTime'].time() if pd.notna(dfp.at[dfindex, 'StartTime']) else default_time\n","        end_time = dfp.at[dfindex, 'EndTime'].time() if pd.notna(dfp.at[dfindex, 'EndTime']) else default_time\n","        conv_time = dfp.at[dfindex, 'ConversationDate'].time() if pd.notna(dfp.at[dfindex, 'ConversationDate']) else default_time\n","        \n","        # Combine dates with times\n","        dfp.at[dfindex, 'StartTime'] = datetime.combine(date.today() - timedelta(days=daysback), start_time)\n","        dfp.at[dfindex, 'EndTime'] = datetime.combine(date.today() - timedelta(days=daysback), end_time)\n","        dfp.at[dfindex, 'ConversationDate'] = datetime.combine(date.today() - timedelta(days=daysback), conv_time)\n","        \n","        dfindex += 1\n","    daysback += 1\n","\n","# Convert back to Spark DataFrame and save\n","df = spark.createDataFrame(dfp)\n","df.write.format('delta').mode('overwrite').option(\"overwriteSchema\", \"true\").saveAsTable('ckm_conv_processed_temp')\n","df = spark.sql(\"SELECT * FROM ckm_conv_processed_temp\")\n","df.write.format('delta').mode('overwrite').option(\"overwriteSchema\", \"false\").saveAsTable('ckm_conv_processed')"]}],"metadata":{"dependencies":{"lakehouse":{"default_lakehouse":"e6ad9dad-e3da-4da5-bca6-6572c466b69a","default_lakehouse_name":"ckm_lakehouse","default_lakehouse_workspace_id":"0d98d480-171b-4b4d-a8e7-80fbd031d1a6","known_lakehouses":[{"id":"e6ad9dad-e3da-4da5-bca6-6572c466b69a"}]}},"kernel_info":{"name":"synapse_pyspark"},"kernelspec":{"display_name":"Synapse PySpark","language":"Python","name":"synapse_pyspark"},"language_info":{"name":"python"},"microsoft":{"language":"python","language_group":"synapse_pyspark","ms_spell_check":{"ms_spell_check_language":"en"}},"nteract":{"version":"[email protected]"},"spark_compute":{"compute_id":"/trident/default"},"synapse_widget":{"state":{},"version":"0.1"},"widgets":{}},"nbformat":4,"nbformat_minor":5}
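
For context on the notebook being re-saved here: the %%sql cell moves every StartTime, EndTime, and ConversationDate forward by the number of days between the newest ConversationDate in ckm_conv_processed and the current date, so the most recent sample conversation always lands on today. Below is a minimal PySpark sketch of that same shift; it assumes the notebook's spark session and table name, and the offset_days / shifted names are illustrative only, not part of this commit.

from pyspark.sql import functions as F

df = spark.table("ckm_conv_processed")

# Days between the newest ConversationDate and today (the same offset the SQL cell computes).
offset_days = df.select(
    F.datediff(F.current_date(), F.max("ConversationDate")).alias("d")
).first()["d"] or 0  # fall back to 0 if the table is empty

# Shift all three date/time columns forward by that many days.
shifted = (
    df.withColumn("StartTime", F.expr(f"StartTime + INTERVAL {offset_days} DAYS"))
      .withColumn("EndTime", F.expr(f"EndTime + INTERVAL {offset_days} DAYS"))
      .withColumn("ConversationDate", F.expr(f"ConversationDate + INTERVAL {offset_days} DAYS"))
)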

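The last Python cell then spreads the shuffled conversations across the previous two weeks according to the weights list (30% on today, 26% on yesterday, and so on), keeping each row's original time of day. The standalone pandas sketch below shows that weighting idea on a tiny made-up frame; the sample timestamps and the reset_index after shuffling are illustrative assumptions, not code from this commit.

import pandas as pd
from datetime import date, datetime, timedelta

# Hypothetical stand-in for the conversations read from ckm_conv_processed.
dfp = pd.DataFrame({
    "ConversationDate": pd.to_datetime([
        "2024-01-01 09:15", "2024-01-02 10:30", "2024-01-03 11:45",
        "2024-01-04 14:00", "2024-01-05 16:20",
    ])
})

# Shuffle, then reset the index so positional .at lookups (0..n-1) line up with it.
dfp = dfp.sample(frac=1).reset_index(drop=True)

# Share of rows assigned to each day, today first, going back 13 days.
weights = [30, 26, 5, 5, 5, 5, 15, 2, 2, 1, 1, 1, 1, 1]

total = len(dfp)
dfindex = 0
for daysback, pct in enumerate(weights):
    target_day = date.today() - timedelta(days=daysback)
    for _ in range(int(pct / 100.0 * total)):
        if dfindex >= total:
            break
        # Keep the row's original time of day; only its date moves.
        t = dfp.at[dfindex, "ConversationDate"].time()
        dfp.at[dfindex, "ConversationDate"] = datetime.combine(target_day, t)
        dfindex += 1

print(dfp.sort_values("ConversationDate"))

As in the notebook, each percentage is truncated to a whole row count, so a few rows can keep their original dates.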