|
1 | | -{ |
2 | | - "cells": [ |
3 | | - { |
4 | | - "cell_type": "code", |
5 | | - "execution_count": null, |
6 | | - "id": "3b73b213-58af-4209-9efd-ac34c9e1e1d7", |
7 | | - "metadata": { |
8 | | - "jupyter": { |
9 | | - "outputs_hidden": false, |
10 | | - "source_hidden": false |
11 | | - }, |
12 | | - "microsoft": { |
13 | | - "language": "python", |
14 | | - "language_group": "synapse_pyspark" |
15 | | - }, |
16 | | - "nteract": { |
17 | | - "transient": { |
18 | | - "deleting": false |
19 | | - } |
20 | | - } |
21 | | - }, |
22 | | - "outputs": [], |
23 | | - "source": [ |
24 | | - "# IMPORTANT: This notebook manipulates sample data to guarantee that the Power BI report includes data for the current date, the last two days, and the last seven days. \n", |
25 | | - "# It is OPTIONAL and is only used to ensure the Power BI report can display data during each deployment." |
26 | | - ] |
27 | | - }, |
28 | | - { |
29 | | - "cell_type": "code", |
30 | | - "execution_count": null, |
31 | | - "id": "e8e036de-0d34-4ea5-ab75-b624ddc2e220", |
32 | | - "metadata": { |
33 | | - "collapsed": false, |
34 | | - "jupyter": { |
35 | | - "outputs_hidden": false, |
36 | | - "source_hidden": false |
37 | | - }, |
38 | | - "microsoft": { |
39 | | - "language": "python", |
40 | | - "language_group": "synapse_pyspark" |
41 | | - }, |
42 | | - "nteract": { |
43 | | - "transient": { |
44 | | - "deleting": false |
45 | | - } |
46 | | - } |
47 | | - }, |
48 | | - "outputs": [], |
49 | | - "source": [ |
50 | | - "%%sql\n", |
51 | | - "--# RUN TO MOVE THE DATES FORWARD TO TODAY\n", |
52 | | - "UPDATE ckm_conv_processed\n", |
53 | | - "SET StartTime = DATEADD(day, (SELECT DATEDIFF(CURRENT_DATE, MAX(ConversationDate)) FROM ckm_conv_processed), StartTime),\n", |
54 | | - " EndTime = DATEADD(day, (SELECT DATEDIFF(CURRENT_DATE, MAX(ConversationDate)) FROM ckm_conv_processed), EndTime),\n", |
55 | | - " ConversationDate = DATEADD(day, (SELECT DATEDIFF(CURRENT_DATE, MAX(ConversationDate)) FROM ckm_conv_processed), ConversationDate)" |
56 | | - ] |
57 | | - }, |
58 | | - { |
59 | | - "cell_type": "code", |
60 | | - "execution_count": null, |
61 | | - "id": "82c35c12-b919-4e55-959a-2300f0412ee0", |
62 | | - "metadata": { |
63 | | - "jupyter": { |
64 | | - "outputs_hidden": false, |
65 | | - "source_hidden": false |
66 | | - }, |
67 | | - "microsoft": { |
68 | | - "language": "python", |
69 | | - "language_group": "synapse_pyspark" |
70 | | - }, |
71 | | - "nteract": { |
72 | | - "transient": { |
73 | | - "deleting": false |
74 | | - } |
75 | | - } |
76 | | - }, |
77 | | - "outputs": [], |
78 | | - "source": [ |
79 | | - "# This code manipulates sample data that allocates a percentage of the data\n", |
80 | | - "# across a two weeks period to support storytelling and demo\n", |
81 | | - "\n", |
82 | | - "import pandas as pd\n", |
83 | | - "from datetime import date, datetime, timedelta\n", |
84 | | - "from pyspark.sql.functions import col\n", |
85 | | - "\n", |
86 | | - "df = spark.sql(\"SELECT * FROM ckm_conv_processed\")\n", |
87 | | - "\n", |
88 | | - "# Convert string columns to timestamp types\n", |
89 | | - "df = df.withColumn(\"StartTime\", col(\"StartTime\").cast(\"timestamp\"))\n", |
90 | | - "df = df.withColumn(\"EndTime\", col(\"EndTime\").cast(\"timestamp\"))\n", |
91 | | - "df = df.withColumn(\"ConversationDate\", col(\"ConversationDate\").cast(\"timestamp\"))\n", |
92 | | - "\n", |
93 | | - "dfp = df.toPandas()\n", |
94 | | - "dfp = dfp.sample(frac=1) # Randomly shuffle the df\n", |
95 | | - "\n", |
96 | | - "# Following list are date weights from Today-0 to Today-13 (two weeks)\n", |
97 | | - "weights = [30, 26, 5, 5, 5, 5, 15, 2, 2, 1, 1, 1, 1, 1]\n", |
98 | | - "dfindex = 0 # index loop through all conversations\n", |
99 | | - "daysback = 0 # start at today and work backwards\n", |
100 | | - "\n", |
101 | | - "# Create a default time (e.g., noon) to use when NaT is encountered\n", |
102 | | - "default_time = datetime.strptime('12:00:00', '%H:%M:%S').time()\n", |
103 | | - "\n", |
104 | | - "for row in weights:\n", |
105 | | - " numconvos = int((row/100.00) * df.count())\n", |
106 | | - " for i in range(numconvos):\n", |
107 | | - " # Handle NaT values by using default time when necessary\n", |
108 | | - " start_time = dfp.at[dfindex, 'StartTime'].time() if pd.notna(dfp.at[dfindex, 'StartTime']) else default_time\n", |
109 | | - " end_time = dfp.at[dfindex, 'EndTime'].time() if pd.notna(dfp.at[dfindex, 'EndTime']) else default_time\n", |
110 | | - " conv_time = dfp.at[dfindex, 'ConversationDate'].time() if pd.notna(dfp.at[dfindex, 'ConversationDate']) else default_time\n", |
111 | | - " \n", |
112 | | - " # Combine dates with times\n", |
113 | | - " dfp.at[dfindex, 'StartTime'] = datetime.combine(date.today() - timedelta(days=daysback), start_time)\n", |
114 | | - " dfp.at[dfindex, 'EndTime'] = datetime.combine(date.today() - timedelta(days=daysback), end_time)\n", |
115 | | - " dfp.at[dfindex, 'ConversationDate'] = datetime.combine(date.today() - timedelta(days=daysback), conv_time)\n", |
116 | | - " \n", |
117 | | - " dfindex += 1\n", |
118 | | - " daysback += 1\n", |
119 | | - "\n", |
120 | | - "# Convert back to Spark DataFrame and save\n", |
121 | | - "df = spark.createDataFrame(dfp)\n", |
122 | | - "df.write.format('delta').mode('overwrite').option(\"overwriteSchema\", \"true\").saveAsTable('ckm_conv_processed_temp')\n", |
123 | | - "df = spark.sql(\"SELECT * FROM ckm_conv_processed_temp\")\n", |
124 | | - "df.write.format('delta').mode('overwrite').option(\"overwriteSchema\", \"false\").saveAsTable('ckm_conv_processed')" |
125 | | - ] |
126 | | - } |
127 | | - ], |
128 | | - "metadata": { |
129 | | - "dependencies": { |
130 | | - "lakehouse": { |
131 | | - "default_lakehouse": "e6ad9dad-e3da-4da5-bca6-6572c466b69a", |
132 | | - "default_lakehouse_name": "ckm_lakehouse", |
133 | | - "default_lakehouse_workspace_id": "0d98d480-171b-4b4d-a8e7-80fbd031d1a6", |
134 | | - "known_lakehouses": [ |
135 | | - { |
136 | | - "id": "e6ad9dad-e3da-4da5-bca6-6572c466b69a" |
137 | | - } |
138 | | - ] |
139 | | - } |
140 | | - }, |
141 | | - "kernel_info": { |
142 | | - "name": "synapse_pyspark" |
143 | | - }, |
144 | | - "kernelspec": { |
145 | | - "display_name": "Synapse PySpark", |
146 | | - "language": "Python", |
147 | | - "name": "synapse_pyspark" |
148 | | - }, |
149 | | - "language_info": { |
150 | | - "name": "python" |
151 | | - }, |
152 | | - "microsoft": { |
153 | | - "language": "python", |
154 | | - "language_group": "synapse_pyspark", |
155 | | - "ms_spell_check": { |
156 | | - "ms_spell_check_language": "en" |
157 | | - } |
158 | | - }, |
159 | | - "nteract": { |
160 | | - |
161 | | - }, |
162 | | - "spark_compute": { |
163 | | - "compute_id": "/trident/default" |
164 | | - }, |
165 | | - "synapse_widget": { |
166 | | - "state": {}, |
167 | | - "version": "0.1" |
168 | | - }, |
169 | | - "widgets": {} |
170 | | - }, |
171 | | - "nbformat": 4, |
172 | | - "nbformat_minor": 5 |
173 | | -} |
| 1 | +{"cells":[{"cell_type":"code","execution_count":null,"id":"3b73b213-58af-4209-9efd-ac34c9e1e1d7","metadata":{"jupyter":{"outputs_hidden":false,"source_hidden":false},"microsoft":{"language":"python","language_group":"synapse_pyspark"},"nteract":{"transient":{"deleting":false}}},"outputs":[],"source":["# IMPORTANT: This notebook manipulates sample data to guarantee that the Power BI report includes data for the current date, the last two days, and the last seven days. \n","# It is OPTIONAL and is only used to ensure the Power BI report can display data during each deployment."]},{"cell_type":"code","execution_count":null,"id":"e8e036de-0d34-4ea5-ab75-b624ddc2e220","metadata":{"collapsed":false,"jupyter":{"outputs_hidden":false,"source_hidden":false},"microsoft":{"language":"python","language_group":"synapse_pyspark"},"nteract":{"transient":{"deleting":false}}},"outputs":[],"source":["%%sql\n","--# RUN TO MOVE THE DATES FORWARD TO TODAY\n","UPDATE ckm_conv_processed\n","SET StartTime = DATEADD(day, (SELECT DATEDIFF(CURRENT_DATE, MAX(ConversationDate)) FROM ckm_conv_processed), StartTime),\n"," EndTime = DATEADD(day, (SELECT DATEDIFF(CURRENT_DATE, MAX(ConversationDate)) FROM ckm_conv_processed), EndTime),\n"," ConversationDate = DATEADD(day, (SELECT DATEDIFF(CURRENT_DATE, MAX(ConversationDate)) FROM ckm_conv_processed), ConversationDate)"]},{"cell_type":"code","execution_count":null,"id":"82c35c12-b919-4e55-959a-2300f0412ee0","metadata":{"jupyter":{"outputs_hidden":false,"source_hidden":false},"microsoft":{"language":"python","language_group":"synapse_pyspark"},"nteract":{"transient":{"deleting":false}}},"outputs":[],"source":["# This code manipulates sample data that allocates a percentage of the data\n","# across a two weeks period to support storytelling and demo\n","\n","import pandas as pd\n","from datetime import date, datetime, timedelta\n","from pyspark.sql.functions import col\n","\n","df = spark.sql(\"SELECT * FROM ckm_conv_processed\")\n","\n","# Convert string columns to timestamp types\n","df = df.withColumn(\"StartTime\", col(\"StartTime\").cast(\"timestamp\"))\n","df = df.withColumn(\"EndTime\", col(\"EndTime\").cast(\"timestamp\"))\n","df = df.withColumn(\"ConversationDate\", col(\"ConversationDate\").cast(\"timestamp\"))\n","\n","dfp = df.toPandas()\n","dfp = dfp.sample(frac=1) # Randomly shuffle the df\n","\n","# Following list are date weights from Today-0 to Today-13 (two weeks)\n","weights = [30, 26, 5, 5, 5, 5, 15, 2, 2, 1, 1, 1, 1, 1]\n","dfindex = 0 # index loop through all conversations\n","daysback = 0 # start at today and work backwards\n","\n","# Create a default time (e.g., noon) to use when NaT is encountered\n","default_time = datetime.strptime('12:00:00', '%H:%M:%S').time()\n","\n","for row in weights:\n"," numconvos = int((row/100.00) * df.count())\n"," for i in range(numconvos):\n"," # Handle NaT values by using default time when necessary\n"," start_time = dfp.at[dfindex, 'StartTime'].time() if pd.notna(dfp.at[dfindex, 'StartTime']) else default_time\n"," end_time = dfp.at[dfindex, 'EndTime'].time() if pd.notna(dfp.at[dfindex, 'EndTime']) else default_time\n"," conv_time = dfp.at[dfindex, 'ConversationDate'].time() if pd.notna(dfp.at[dfindex, 'ConversationDate']) else default_time\n"," \n"," # Combine dates with times\n"," dfp.at[dfindex, 'StartTime'] = datetime.combine(date.today() - timedelta(days=daysback), start_time)\n"," dfp.at[dfindex, 'EndTime'] = datetime.combine(date.today() - timedelta(days=daysback), end_time)\n"," dfp.at[dfindex, 
'ConversationDate'] = datetime.combine(date.today() - timedelta(days=daysback), conv_time)\n"," \n"," dfindex += 1\n"," daysback += 1\n","\n","# Convert back to Spark DataFrame and save\n","df = spark.createDataFrame(dfp)\n","df.write.format('delta').mode('overwrite').option(\"overwriteSchema\", \"true\").saveAsTable('ckm_conv_processed_temp')\n","df = spark.sql(\"SELECT * FROM ckm_conv_processed_temp\")\n","df.write.format('delta').mode('overwrite').option(\"overwriteSchema\", \"false\").saveAsTable('ckm_conv_processed')"]}],"metadata":{"dependencies":{"lakehouse":{"default_lakehouse":"e6ad9dad-e3da-4da5-bca6-6572c466b69a","default_lakehouse_name":"ckm_lakehouse","default_lakehouse_workspace_id":"0d98d480-171b-4b4d-a8e7-80fbd031d1a6","known_lakehouses":[{"id":"e6ad9dad-e3da-4da5-bca6-6572c466b69a"}]}},"kernel_info":{"name":"synapse_pyspark"},"kernelspec":{"display_name":"Synapse PySpark","language":"Python","name":"synapse_pyspark"},"language_info":{"name":"python"},"microsoft":{"language":"python","language_group":"synapse_pyspark","ms_spell_check":{"ms_spell_check_language":"en"}},"nteract":{"version":" [email protected]"},"spark_compute":{"compute_id":"/trident/default"},"synapse_widget":{"state":{},"version":"0.1"},"widgets":{}},"nbformat":4,"nbformat_minor":5} |