
Commit 1f51897

BugFix11904
1 parent a661399 commit 1f51897

File tree

1 file changed: +173 -1 lines changed

Lines changed: 173 additions & 1 deletion
@@ -1 +1,173 @@
-{"cells":[{"cell_type":"code","execution_count":null,"id":"3b73b213-58af-4209-9efd-ac34c9e1e1d7","metadata":{"jupyter":{"outputs_hidden":false,"source_hidden":false},"microsoft":{"language":"python","language_group":"synapse_pyspark"},"nteract":{"transient":{"deleting":false}}},"outputs":[],"source":["# IMPORTANT: This notebook manipulates sample data to guarantee that the Power BI report includes data for the current date, the last two days, and the last seven days. \n","# It is OPTIONAL and is only used to ensure the Power BI report can display data during each deployment."]},{"cell_type":"code","execution_count":null,"id":"e8e036de-0d34-4ea5-ab75-b624ddc2e220","metadata":{"collapsed":false,"jupyter":{"outputs_hidden":false,"source_hidden":false},"microsoft":{"language":"python","language_group":"synapse_pyspark"},"nteract":{"transient":{"deleting":false}}},"outputs":[],"source":["%%sql\n","--# RUN TO MOVE THE DATES FORWARD TO TODAY\n","UPDATE ckm_conv_processed\n","SET StartTime = DATEADD(day, (SELECT DATEDIFF(NOW(), MAX(ConversationDate)) FROM ckm_conv_processed), StartTime),\n","EndTime = DATEADD(day, (SELECT DATEDIFF(NOW(), MAX(ConversationDate)) FROM ckm_conv_processed), EndTime),\n","ConversationDate = DATEADD(day, (SELECT DATEDIFF(NOW(), MAX(ConversationDate)) FROM ckm_conv_processed), ConversationDate)"]},{"cell_type":"code","execution_count":null,"id":"82c35c12-b919-4e55-959a-2300f0412ee0","metadata":{"jupyter":{"outputs_hidden":false,"source_hidden":false},"microsoft":{"language":"python","language_group":"synapse_pyspark"},"nteract":{"transient":{"deleting":false}}},"outputs":[],"source":["# This code manipulates sample data that allocates a percentage of the data\n","# across a two weeks period to support storytelling and demo\n","\n","import pandas as pd\n","from datetime import date, datetime, timedelta\n","from pyspark.sql.functions import col\n","\n","df = spark.sql(\"SELECT * FROM ckm_conv_processed\")\n","\n","# Convert string columns to timestamp types\n","df = df.withColumn(\"StartTime\", col(\"StartTime\").cast(\"timestamp\"))\n","df = df.withColumn(\"EndTime\", col(\"EndTime\").cast(\"timestamp\"))\n","df = df.withColumn(\"ConversationDate\", col(\"ConversationDate\").cast(\"timestamp\"))\n","\n","dfp = df.toPandas()\n","dfp = dfp.sample(frac=1) # This line randomly shuffles the df for a new distribution and demo percentages\n","\n","# Following list are date weights from Today-0 to Today-13 (two weeks)\n","weights = [30, 26, 5, 5, 5, 5, 15, 2, 2, 1, 1, 1, 1, 1]\n","dfindex = 0 # index loop through all conversations\n","daysback = 0 # start at today and work backwards\n","for row in weights:\n","    numconvos = int((row/100.00) * df.count())\n","    for i in range(numconvos):\n","        dfp.at[dfindex, 'StartTime'] = datetime.combine(date.today() - timedelta(days = daysback) , dfp.at[dfindex, 'StartTime'].time())\n","        dfp.at[dfindex, 'EndTime'] = datetime.combine(date.today() - timedelta(days = daysback) , dfp.at[dfindex, 'EndTime'].time())\n","        dfp.at[dfindex, 'ConversationDate'] = datetime.combine(date.today() - timedelta(days = daysback) , dfp.at[dfindex, 'ConversationDate'].time())\n","        dfindex += 1\n","    daysback += 1\n","df = spark.createDataFrame(dfp)\n","\n","# Write to temp table, then update final results table\n","df.write.format('delta').mode('overwrite').option(\"overwriteSchema\", \"true\").saveAsTable('ckm_conv_processed_temp')\n","df = spark.sql(\"SELECT * FROM ckm_conv_processed_temp \")\n","df.write.format('delta').mode('overwrite').option(\"overwriteSchema\", \"false\").saveAsTable('ckm_conv_processed')"]}],"metadata":{"dependencies":{"lakehouse":{"default_lakehouse":"e6ad9dad-e3da-4da5-bca6-6572c466b69a","default_lakehouse_name":"ckm_lakehouse","default_lakehouse_workspace_id":"0d98d480-171b-4b4d-a8e7-80fbd031d1a6","known_lakehouses":[{"id":"e6ad9dad-e3da-4da5-bca6-6572c466b69a"}]}},"kernel_info":{"name":"synapse_pyspark"},"kernelspec":{"display_name":"Synapse PySpark","language":"Python","name":"synapse_pyspark"},"language_info":{"name":"python"},"microsoft":{"language":"python","language_group":"synapse_pyspark","ms_spell_check":{"ms_spell_check_language":"en"}},"nteract":{"version":"[email protected]"},"spark_compute":{"compute_id":"/trident/default"},"synapse_widget":{"state":{},"version":"0.1"},"widgets":{}},"nbformat":4,"nbformat_minor":5}
+{
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "3b73b213-58af-4209-9efd-ac34c9e1e1d7",
+      "metadata": {
+        "jupyter": {
+          "outputs_hidden": false,
+          "source_hidden": false
+        },
+        "microsoft": {
+          "language": "python",
+          "language_group": "synapse_pyspark"
+        },
+        "nteract": {
+          "transient": {
+            "deleting": false
+          }
+        }
+      },
+      "outputs": [],
+      "source": [
+        "# IMPORTANT: This notebook manipulates sample data to guarantee that the Power BI report includes data for the current date, the last two days, and the last seven days. \n",
+        "# It is OPTIONAL and is only used to ensure the Power BI report can display data during each deployment."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "e8e036de-0d34-4ea5-ab75-b624ddc2e220",
+      "metadata": {
+        "collapsed": false,
+        "jupyter": {
+          "outputs_hidden": false,
+          "source_hidden": false
+        },
+        "microsoft": {
+          "language": "python",
+          "language_group": "synapse_pyspark"
+        },
+        "nteract": {
+          "transient": {
+            "deleting": false
+          }
+        }
+      },
+      "outputs": [],
+      "source": [
+        "%%sql\n",
+        "--# RUN TO MOVE THE DATES FORWARD TO TODAY\n",
+        "UPDATE ckm_conv_processed\n",
+        "SET StartTime = DATEADD(day, (SELECT DATEDIFF(CURRENT_DATE, MAX(ConversationDate)) FROM ckm_conv_processed), StartTime),\n",
+        "    EndTime = DATEADD(day, (SELECT DATEDIFF(CURRENT_DATE, MAX(ConversationDate)) FROM ckm_conv_processed), EndTime),\n",
+        "    ConversationDate = DATEADD(day, (SELECT DATEDIFF(CURRENT_DATE, MAX(ConversationDate)) FROM ckm_conv_processed), ConversationDate)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "82c35c12-b919-4e55-959a-2300f0412ee0",
+      "metadata": {
+        "jupyter": {
+          "outputs_hidden": false,
+          "source_hidden": false
+        },
+        "microsoft": {
+          "language": "python",
+          "language_group": "synapse_pyspark"
+        },
+        "nteract": {
+          "transient": {
+            "deleting": false
+          }
+        }
+      },
+      "outputs": [],
+      "source": [
+        "# This code manipulates sample data that allocates a percentage of the data\n",
+        "# across a two weeks period to support storytelling and demo\n",
+        "\n",
+        "import pandas as pd\n",
+        "from datetime import date, datetime, timedelta\n",
+        "from pyspark.sql.functions import col\n",
+        "\n",
+        "df = spark.sql(\"SELECT * FROM ckm_conv_processed\")\n",
+        "\n",
+        "# Convert string columns to timestamp types\n",
+        "df = df.withColumn(\"StartTime\", col(\"StartTime\").cast(\"timestamp\"))\n",
+        "df = df.withColumn(\"EndTime\", col(\"EndTime\").cast(\"timestamp\"))\n",
+        "df = df.withColumn(\"ConversationDate\", col(\"ConversationDate\").cast(\"timestamp\"))\n",
+        "\n",
+        "dfp = df.toPandas()\n",
+        "dfp = dfp.sample(frac=1) # Randomly shuffle the df\n",
+        "\n",
+        "# Following list are date weights from Today-0 to Today-13 (two weeks)\n",
+        "weights = [30, 26, 5, 5, 5, 5, 15, 2, 2, 1, 1, 1, 1, 1]\n",
+        "dfindex = 0 # index loop through all conversations\n",
+        "daysback = 0 # start at today and work backwards\n",
+        "\n",
+        "# Create a default time (e.g., noon) to use when NaT is encountered\n",
+        "default_time = datetime.strptime('12:00:00', '%H:%M:%S').time()\n",
+        "\n",
+        "for row in weights:\n",
+        "    numconvos = int((row/100.00) * df.count())\n",
+        "    for i in range(numconvos):\n",
+        "        # Handle NaT values by using default time when necessary\n",
+        "        start_time = dfp.at[dfindex, 'StartTime'].time() if pd.notna(dfp.at[dfindex, 'StartTime']) else default_time\n",
+        "        end_time = dfp.at[dfindex, 'EndTime'].time() if pd.notna(dfp.at[dfindex, 'EndTime']) else default_time\n",
+        "        conv_time = dfp.at[dfindex, 'ConversationDate'].time() if pd.notna(dfp.at[dfindex, 'ConversationDate']) else default_time\n",
+        "\n",
+        "        # Combine dates with times\n",
+        "        dfp.at[dfindex, 'StartTime'] = datetime.combine(date.today() - timedelta(days=daysback), start_time)\n",
+        "        dfp.at[dfindex, 'EndTime'] = datetime.combine(date.today() - timedelta(days=daysback), end_time)\n",
+        "        dfp.at[dfindex, 'ConversationDate'] = datetime.combine(date.today() - timedelta(days=daysback), conv_time)\n",
+        "\n",
+        "        dfindex += 1\n",
+        "    daysback += 1\n",
+        "\n",
+        "# Convert back to Spark DataFrame and save\n",
+        "df = spark.createDataFrame(dfp)\n",
+        "df.write.format('delta').mode('overwrite').option(\"overwriteSchema\", \"true\").saveAsTable('ckm_conv_processed_temp')\n",
+        "df = spark.sql(\"SELECT * FROM ckm_conv_processed_temp\")\n",
+        "df.write.format('delta').mode('overwrite').option(\"overwriteSchema\", \"false\").saveAsTable('ckm_conv_processed')"
+      ]
+    }
+  ],
+  "metadata": {
+    "dependencies": {
+      "lakehouse": {
+        "default_lakehouse": "e6ad9dad-e3da-4da5-bca6-6572c466b69a",
+        "default_lakehouse_name": "ckm_lakehouse",
+        "default_lakehouse_workspace_id": "0d98d480-171b-4b4d-a8e7-80fbd031d1a6",
+        "known_lakehouses": [
+          {
+            "id": "e6ad9dad-e3da-4da5-bca6-6572c466b69a"
+          }
+        ]
+      }
+    },
+    "kernel_info": {
+      "name": "synapse_pyspark"
+    },
+    "kernelspec": {
+      "display_name": "Synapse PySpark",
+      "language": "Python",
+      "name": "synapse_pyspark"
+    },
+    "language_info": {
+      "name": "python"
+    },
+    "microsoft": {
+      "language": "python",
+      "language_group": "synapse_pyspark",
+      "ms_spell_check": {
+        "ms_spell_check_language": "en"
+      }
+    },
+    "nteract": {
+      "version": "[email protected]"
+    },
+    "spark_compute": {
+      "compute_id": "/trident/default"
+    },
+    "synapse_widget": {
+      "state": {},
+      "version": "0.1"
+    },
+    "widgets": {}
+  },
+  "nbformat": 4,
+  "nbformat_minor": 5
+}
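
The %%sql cell above shifts every date column forward by the gap between the newest ConversationDate and today. A rough equivalent of that date-shift in the PySpark DataFrame API might look like the sketch below; it assumes the notebook's built-in spark session and the same ckm_conv_processed columns, and it is not part of the commit.

    # Sketch only: shift StartTime/EndTime/ConversationDate so the newest row lands on today.
    from pyspark.sql import functions as F

    df = spark.table("ckm_conv_processed")

    # Days between the newest ConversationDate and today (assumes the column is a date/timestamp
    # or castable to one, as the notebook's third cell does).
    offset = df.select(
        F.datediff(F.current_date(), F.max("ConversationDate")).alias("d")
    ).first()["d"]

    # Add that many days to each timestamp column.
    for c in ("StartTime", "EndTime", "ConversationDate"):
        df = df.withColumn(c, F.col(c) + F.expr(f"INTERVAL {offset} DAYS"))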

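The third cell's NaT guard keeps each row's original time of day but falls back to a default noon time when a timestamp is missing. A small self-contained pandas illustration of that pattern, with toy values rather than table data:

    # Toy illustration of the NaT fallback used in the redistribution loop.
    import pandas as pd
    from datetime import date, datetime, timedelta

    default_time = datetime.strptime("12:00:00", "%H:%M:%S").time()
    times = pd.Series(pd.to_datetime(["2024-01-05 08:30:00", None]))  # second value is NaT

    shifted = [
        datetime.combine(
            date.today() - timedelta(days=1),            # target day (yesterday, for example)
            t.time() if pd.notna(t) else default_time,   # keep the time of day, or use the default
        )
        for t in times
    ]
    print(shifted)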