|
6 | 6 | "source": [ |
7 | 7 | "# Analyzers Basic Tutorial\n", |
8 | 8 | "\n", |
| 9 | + "__Updated June 2024 to use a new dataset__\n", |
| 10 | + "\n", |
9 | 11 | "This Jupyter notebook will give a basic tutorial on how to use PyDeequ's Analyzers module." |
10 | 12 | ] |
11 | 13 | }, |
12 | 14 | { |
13 | 15 | "cell_type": "code", |
14 | 16 | "execution_count": 1, |
15 | | - "metadata": {}, |
| 17 | + "metadata": { |
| 18 | + "tags": [] |
| 19 | + }, |
16 | 20 | "outputs": [], |
| 21 | + "source": [ |
| 22 | + "import os\n", |
| 23 | + "# indicate your Spark version, here we use Spark 3.5 with pydeequ 1.4.0\n", |
| 24 | + "os.environ[\"SPARK_VERSION\"] = '3.5'" |
| 25 | + ] |
| 26 | + }, |
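
Recent PyDeequ releases read the `SPARK_VERSION` environment variable at import time to pick the Deequ Maven coordinate that matches your Spark build, which is why this cell must run before `import pydeequ`. A minimal sketch of the idea (the `print` line is illustrative, not part of the notebook):

```python
import os

# Must be set before importing pydeequ: the matching Deequ Maven
# coordinate is resolved from this variable at import time.
os.environ["SPARK_VERSION"] = "3.5"

import pydeequ
print(pydeequ.deequ_maven_coord)  # the coordinate Spark will fetch via Ivy
```
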
| 27 | + { |
| 28 | + "cell_type": "code", |
| 29 | + "execution_count": 2, |
| 30 | + "metadata": { |
| 31 | + "tags": [] |
| 32 | + }, |
| 33 | + "outputs": [ |
| 34 | + { |
| 35 | + "name": "stdout", |
| 36 | + "output_type": "stream", |
| 37 | + "text": [ |
| 38 | + ":: loading settings :: url = jar:file:/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml\n" |
| 39 | + ] |
| 40 | + }, |
| 41 | + { |
| 42 | + "name": "stderr", |
| 43 | + "output_type": "stream", |
| 44 | + "text": [ |
| 45 | + "Ivy Default Cache set to: /home/ec2-user/.ivy2/cache\n", |
| 46 | + "The jars for the packages stored in: /home/ec2-user/.ivy2/jars\n", |
| 47 | + "com.amazon.deequ#deequ added as a dependency\n", |
| 48 | + ":: resolving dependencies :: org.apache.spark#spark-submit-parent-23421fea-77b3-4d69-9251-54adf6371fd9;1.0\n", |
| 49 | + "\tconfs: [default]\n", |
| 50 | + "\tfound com.amazon.deequ#deequ;2.0.3-spark-3.3 in central\n", |
| 51 | + "\tfound org.scala-lang#scala-reflect;2.12.10 in central\n", |
| 52 | + "\tfound org.scalanlp#breeze_2.12;0.13.2 in central\n", |
| 53 | + "\tfound org.scalanlp#breeze-macros_2.12;0.13.2 in central\n", |
| 54 | + "\tfound com.github.fommil.netlib#core;1.1.2 in central\n", |
| 55 | + "\tfound net.sf.opencsv#opencsv;2.3 in central\n", |
| 56 | + "\tfound com.github.rwl#jtransforms;2.4.0 in central\n", |
| 57 | + "\tfound junit#junit;4.8.2 in central\n", |
| 58 | + "\tfound org.apache.commons#commons-math3;3.2 in central\n", |
| 59 | + "\tfound org.spire-math#spire_2.12;0.13.0 in central\n", |
| 60 | + "\tfound org.spire-math#spire-macros_2.12;0.13.0 in central\n", |
| 61 | + "\tfound org.typelevel#machinist_2.12;0.6.1 in central\n", |
| 62 | + "\tfound com.chuusai#shapeless_2.12;2.3.2 in central\n", |
| 63 | + "\tfound org.typelevel#macro-compat_2.12;1.1.1 in central\n", |
| 64 | + "\tfound org.slf4j#slf4j-api;1.7.5 in central\n", |
| 65 | + ":: resolution report :: resolve 435ms :: artifacts dl 12ms\n", |
| 66 | + "\t:: modules in use:\n", |
| 67 | + "\tcom.amazon.deequ#deequ;2.0.3-spark-3.3 from central in [default]\n", |
| 68 | + "\tcom.chuusai#shapeless_2.12;2.3.2 from central in [default]\n", |
| 69 | + "\tcom.github.fommil.netlib#core;1.1.2 from central in [default]\n", |
| 70 | + "\tcom.github.rwl#jtransforms;2.4.0 from central in [default]\n", |
| 71 | + "\tjunit#junit;4.8.2 from central in [default]\n", |
| 72 | + "\tnet.sf.opencsv#opencsv;2.3 from central in [default]\n", |
| 73 | + "\torg.apache.commons#commons-math3;3.2 from central in [default]\n", |
| 74 | + "\torg.scala-lang#scala-reflect;2.12.10 from central in [default]\n", |
| 75 | + "\torg.scalanlp#breeze-macros_2.12;0.13.2 from central in [default]\n", |
| 76 | + "\torg.scalanlp#breeze_2.12;0.13.2 from central in [default]\n", |
| 77 | + "\torg.slf4j#slf4j-api;1.7.5 from central in [default]\n", |
| 78 | + "\torg.spire-math#spire-macros_2.12;0.13.0 from central in [default]\n", |
| 79 | + "\torg.spire-math#spire_2.12;0.13.0 from central in [default]\n", |
| 80 | + "\torg.typelevel#machinist_2.12;0.6.1 from central in [default]\n", |
| 81 | + "\torg.typelevel#macro-compat_2.12;1.1.1 from central in [default]\n", |
| 82 | + "\t:: evicted modules:\n", |
| 83 | + "\torg.scala-lang#scala-reflect;2.12.1 by [org.scala-lang#scala-reflect;2.12.10] in [default]\n", |
| 84 | + "\torg.scala-lang#scala-reflect;2.12.0 by [org.scala-lang#scala-reflect;2.12.10] in [default]\n", |
| 85 | + "\t---------------------------------------------------------------------\n", |
| 86 | + "\t| | modules || artifacts |\n", |
| 87 | + "\t| conf | number| search|dwnlded|evicted|| number|dwnlded|\n", |
| 88 | + "\t---------------------------------------------------------------------\n", |
| 89 | + "\t| default | 17 | 0 | 0 | 2 || 15 | 0 |\n", |
| 90 | + "\t---------------------------------------------------------------------\n", |
| 91 | + ":: retrieving :: org.apache.spark#spark-submit-parent-23421fea-77b3-4d69-9251-54adf6371fd9\n", |
| 92 | + "\tconfs: [default]\n", |
| 93 | + "\t0 artifacts copied, 15 already retrieved (0kB/9ms)\n" |
| 94 | + ] |
| 95 | + }, |
| 96 | + { |
| 97 | + "name": "stdout", |
| 98 | + "output_type": "stream", |
| 99 | + "text": [ |
| 100 | + "24/06/14 23:25:58 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n" |
| 101 | + ] |
| 102 | + }, |
| 103 | + { |
| 104 | + "name": "stderr", |
| 105 | + "output_type": "stream", |
| 106 | + "text": [ |
| 107 | + "Setting default log level to \"WARN\".\n", |
| 108 | + "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n" |
| 109 | + ] |
| 110 | + }, |
| 111 | + { |
| 112 | + "name": "stdout", |
| 113 | + "output_type": "stream", |
| 114 | + "text": [ |
| 115 | + "24/06/14 23:25:59 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.\n" |
| 116 | + ] |
| 117 | + } |
| 118 | + ], |
17 | 119 | "source": [ |
18 | 120 | "from pyspark.sql import SparkSession, Row, DataFrame\n", |
19 | 121 | "import json\n", |
|
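
The Ivy log above shows Spark resolving the Deequ jar when the session starts. The session-construction lines are elided from this diff; they presumably follow the standard pattern from the PyDeequ README, roughly:

```python
import pydeequ
from pyspark.sql import SparkSession

# Pull in the Deequ jar matching SPARK_VERSION, and exclude the
# f2j artifact, as the PyDeequ README recommends.
spark = (SparkSession.builder
         .config("spark.jars.packages", pydeequ.deequ_maven_coord)
         .config("spark.jars.excludes", pydeequ.f2j_maven_coord)
         .getOrCreate())
```
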
36 | 138 | "cell_type": "markdown", |
37 | 139 | "metadata": {}, |
38 | 140 | "source": [ |
39 | | - "### We will be using the Amazon Product Reviews dataset -- specifically the Electronics subset. " |
| 141 | + "### We will be using the synthetic reviews dataset for Electronics products" |
40 | 142 | ] |
41 | 143 | }, |
42 | 144 | { |
43 | 145 | "cell_type": "code", |
44 | | - "execution_count": 2, |
45 | | - "metadata": {}, |
| 146 | + "execution_count": 3, |
| 147 | + "metadata": { |
| 148 | + "tags": [] |
| 149 | + }, |
46 | 150 | "outputs": [ |
| 151 | + { |
| 152 | + "name": "stdout", |
| 153 | + "output_type": "stream", |
| 154 | + "text": [ |
| 155 | + "24/06/14 23:26:01 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties\n" |
| 156 | + ] |
| 157 | + }, |
| 158 | + { |
| 159 | + "name": "stderr", |
| 160 | + "output_type": "stream", |
| 161 | + "text": [ |
| 162 | + " \r" |
| 163 | + ] |
| 164 | + }, |
47 | 165 | { |
48 | 166 | "name": "stdout", |
49 | 167 | "output_type": "stream", |
|
53 | 171 | " |-- customer_id: string (nullable = true)\n", |
54 | 172 | " |-- review_id: string (nullable = true)\n", |
55 | 173 | " |-- product_id: string (nullable = true)\n", |
56 | | - " |-- product_parent: string (nullable = true)\n", |
57 | 174 | " |-- product_title: string (nullable = true)\n", |
58 | | - " |-- star_rating: integer (nullable = true)\n", |
59 | | - " |-- helpful_votes: integer (nullable = true)\n", |
60 | | - " |-- total_votes: integer (nullable = true)\n", |
61 | | - " |-- vine: string (nullable = true)\n", |
62 | | - " |-- verified_purchase: string (nullable = true)\n", |
| 175 | + " |-- star_rating: long (nullable = true)\n", |
| 176 | + " |-- helpful_votes: long (nullable = true)\n", |
| 177 | + " |-- total_votes: long (nullable = true)\n", |
| 178 | + " |-- insight: string (nullable = true)\n", |
63 | 179 | " |-- review_headline: string (nullable = true)\n", |
64 | 180 | " |-- review_body: string (nullable = true)\n", |
65 | | - " |-- review_date: date (nullable = true)\n", |
66 | | - " |-- year: integer (nullable = true)\n", |
| 181 | + " |-- review_date: timestamp (nullable = true)\n", |
| 182 | + " |-- review_year: long (nullable = true)\n", |
67 | 183 | "\n" |
68 | 184 | ] |
69 | 185 | } |
70 | 186 | ], |
71 | 187 | "source": [ |
72 | | - "df = spark.read.parquet(\"s3a://amazon-reviews-pds/parquet/product_category=Electronics/\")\n", |
| 188 | + "df = spark.read.parquet(\"s3a://aws-bigdata-blog/generated_synthetic_reviews/data/product_category=Electronics/\")\n", |
73 | 189 | "\n", |
74 | 190 | "df.printSchema()" |
75 | 191 | ] |
76 | 192 | }, |
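
The dataset lives in a public S3 bucket. On a host without an attached IAM role, `s3a://` reads typically need the Hadoop anonymous-credentials provider; a hedged sketch (these are standard `hadoop-aws` settings, but whether you need them depends on your environment):

```python
# Only needed when no AWS credentials/role are available: read the
# public bucket anonymously via the hadoop-aws S3A connector.
spark.sparkContext._jsc.hadoopConfiguration().set(
    "fs.s3a.aws.credentials.provider",
    "org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider",
)

df = spark.read.parquet(
    "s3a://aws-bigdata-blog/generated_synthetic_reviews/data/product_category=Electronics/"
)
```
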
77 | 193 | { |
78 | 194 | "cell_type": "code", |
79 | | - "execution_count": 3, |
80 | | - "metadata": {}, |
| 195 | + "execution_count": 4, |
| 196 | + "metadata": { |
| 197 | + "tags": [] |
| 198 | + }, |
81 | 199 | "outputs": [ |
| 200 | + { |
| 201 | + "name": "stdout", |
| 202 | + "output_type": "stream", |
| 203 | + "text": [ |
| 204 | + "24/06/14 23:26:06 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.\n" |
| 205 | + ] |
| 206 | + }, |
| 207 | + { |
| 208 | + "name": "stderr", |
| 209 | + "output_type": "stream", |
| 210 | + "text": [ |
| 211 | + " \r" |
| 212 | + ] |
| 213 | + }, |
82 | 214 | { |
83 | 215 | "name": "stdout", |
84 | 216 | "output_type": "stream", |
|
87 | 219 | "| entity| instance| name| value|\n", |
88 | 220 | "+-----------+--------------------+-------------------+--------------------+\n", |
89 | 221 | "| Column| review_id| Completeness| 1.0|\n", |
90 | | - "| Column| review_id|ApproxCountDistinct| 3010972.0|\n", |
91 | | - "|Mutlicolumn|total_votes,star_...| Correlation|-0.03451097996538765|\n", |
92 | | - "| Dataset| *| Size| 3120938.0|\n", |
93 | | - "| Column| star_rating| Mean| 4.036143941340712|\n", |
94 | | - "| Column| top star_rating| Compliance| 0.7494070692849394|\n", |
95 | | - "|Mutlicolumn|total_votes,helpf...| Correlation| 0.9936463809903863|\n", |
| 222 | + "| Column| review_id|ApproxCountDistinct| 3160409.0|\n", |
| 223 | + "|Mutlicolumn|total_votes,star_...| Correlation|-7.38808965018615...|\n", |
| 224 | + "| Dataset| *| Size| 3010972.0|\n", |
| 225 | + "| Column| star_rating| Mean| 3.9999973430506826|\n", |
| 226 | + "| Column| top star_rating| Compliance| 0.7499993357626706|\n", |
| 227 | + "|Mutlicolumn|total_votes,helpf...| Correlation| 0.9817922803462663|\n", |
96 | 228 | "+-----------+--------------------+-------------------+--------------------+\n", |
97 | 229 | "\n" |
98 | 230 | ] |
| 231 | + }, |
| 232 | + { |
| 233 | + "name": "stderr", |
| 234 | + "output_type": "stream", |
| 235 | + "text": [ |
| 236 | + "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/pyspark/sql/dataframe.py:127: UserWarning: DataFrame constructor is internal. Do not directly use it.\n", |
| 237 | + " warnings.warn(\"DataFrame constructor is internal. Do not directly use it.\")\n" |
| 238 | + ] |
99 | 239 | } |
100 | 240 | ], |
101 | 241 | "source": [ |
|
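
The analysis cell's source is elided from this diff, but the metrics table below pins down which analyzers run. A sketch in the style of the PyDeequ README (the `star_rating >= 4.0` predicate for the "top star_rating" Compliance check is an assumption, inferred from the ~0.75 compliance value):

```python
from pydeequ.analyzers import (
    AnalysisRunner, AnalyzerContext, ApproxCountDistinct,
    Completeness, Compliance, Correlation, Mean, Size,
)

# One analyzer per row of the metrics table below.
analysisResult = (AnalysisRunner(spark)
                  .onData(df)
                  .addAnalyzer(Size())
                  .addAnalyzer(Completeness("review_id"))
                  .addAnalyzer(ApproxCountDistinct("review_id"))
                  .addAnalyzer(Mean("star_rating"))
                  .addAnalyzer(Compliance("top star_rating", "star_rating >= 4.0"))  # assumed predicate
                  .addAnalyzer(Correlation("total_votes", "star_rating"))
                  .addAnalyzer(Correlation("total_votes", "helpful_votes"))
                  .run())

analysisResult_df = AnalyzerContext.successMetricsAsDataFrame(spark, analysisResult)
analysisResult_df.show()
```
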
119 | 259 | { |
120 | 260 | "cell_type": "code", |
121 | 261 | "execution_count": 5, |
122 | | - "metadata": {}, |
| 262 | + "metadata": { |
| 263 | + "tags": [] |
| 264 | + }, |
123 | 265 | "outputs": [ |
124 | 266 | { |
125 | 267 | "data": { |
|
161 | 303 | " <td>Column</td>\n", |
162 | 304 | " <td>review_id</td>\n", |
163 | 305 | " <td>ApproxCountDistinct</td>\n", |
164 | | - " <td>3.010972e+06</td>\n", |
| 306 | + " <td>3.160409e+06</td>\n", |
165 | 307 | " </tr>\n", |
166 | 308 | " <tr>\n", |
167 | 309 | " <th>2</th>\n", |
168 | 310 | " <td>Mutlicolumn</td>\n", |
169 | 311 | " <td>total_votes,star_rating</td>\n", |
170 | 312 | " <td>Correlation</td>\n", |
171 | | - " <td>-3.451098e-02</td>\n", |
| 313 | + " <td>-7.388090e-04</td>\n", |
172 | 314 | " </tr>\n", |
173 | 315 | " <tr>\n", |
174 | 316 | " <th>3</th>\n", |
175 | 317 | " <td>Dataset</td>\n", |
176 | 318 | " <td>*</td>\n", |
177 | 319 | " <td>Size</td>\n", |
178 | | - " <td>3.120938e+06</td>\n", |
| 320 | + " <td>3.010972e+06</td>\n", |
179 | 321 | " </tr>\n", |
180 | 322 | " <tr>\n", |
181 | 323 | " <th>4</th>\n", |
182 | 324 | " <td>Column</td>\n", |
183 | 325 | " <td>star_rating</td>\n", |
184 | 326 | " <td>Mean</td>\n", |
185 | | - " <td>4.036144e+00</td>\n", |
| 327 | + " <td>3.999997e+00</td>\n", |
186 | 328 | " </tr>\n", |
187 | 329 | " <tr>\n", |
188 | 330 | " <th>5</th>\n", |
189 | 331 | " <td>Column</td>\n", |
190 | 332 | " <td>top star_rating</td>\n", |
191 | 333 | " <td>Compliance</td>\n", |
192 | | - " <td>7.494071e-01</td>\n", |
| 334 | + " <td>7.499993e-01</td>\n", |
193 | 335 | " </tr>\n", |
194 | 336 | " <tr>\n", |
195 | 337 | " <th>6</th>\n", |
196 | 338 | " <td>Mutlicolumn</td>\n", |
197 | 339 | " <td>total_votes,helpful_votes</td>\n", |
198 | 340 | " <td>Correlation</td>\n", |
199 | | - " <td>9.936464e-01</td>\n", |
| 341 | + " <td>9.817923e-01</td>\n", |
200 | 342 | " </tr>\n", |
201 | 343 | " </tbody>\n", |
202 | 344 | "</table>\n", |
|
205 | 347 | "text/plain": [ |
206 | 348 | " entity instance name value\n", |
207 | 349 | "0 Column review_id Completeness 1.000000e+00\n", |
208 | | - "1 Column review_id ApproxCountDistinct 3.010972e+06\n", |
209 | | - "2 Mutlicolumn total_votes,star_rating Correlation -3.451098e-02\n", |
210 | | - "3 Dataset * Size 3.120938e+06\n", |
211 | | - "4 Column star_rating Mean 4.036144e+00\n", |
212 | | - "5 Column top star_rating Compliance 7.494071e-01\n", |
213 | | - "6 Mutlicolumn total_votes,helpful_votes Correlation 9.936464e-01" |
| 350 | + "1 Column review_id ApproxCountDistinct 3.160409e+06\n", |
| 351 | + "2 Mutlicolumn total_votes,star_rating Correlation -7.388090e-04\n", |
| 352 | + "3 Dataset * Size 3.010972e+06\n", |
| 353 | + "4 Column star_rating Mean 3.999997e+00\n", |
| 354 | + "5 Column top star_rating Compliance 7.499993e-01\n", |
| 355 | + "6 Mutlicolumn total_votes,helpful_votes Correlation 9.817923e-01" |
214 | 356 | ] |
215 | 357 | }, |
216 | 358 | "execution_count": 5, |
|
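
The pandas table above is presumably just the Spark metrics DataFrame converted for display; with `analysisResult_df` from the sketch earlier:

```python
# Collect the (small) metrics DataFrame to the driver as pandas
# for the richer HTML rendering shown above.
analysisResult_pd = analysisResult_df.toPandas()
analysisResult_pd
```
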
247 | 389 | "name": "python", |
248 | 390 | "nbconvert_exporter": "python", |
249 | 391 | "pygments_lexer": "ipython3", |
250 | | - "version": "3.6.10" |
| 392 | + "version": "3.10.14" |
251 | 393 | } |
252 | 394 | }, |
253 | 395 | "nbformat": 4, |
|