Skip to content

Commit 90e69f9

Browse files
authored
updated the notebooks to use a new synthetic dataset for demonstration; addressed PR comments #208 and #230 (#231)
1 parent 56b34e6 commit 90e69f9

File tree

6 files changed

+2621
-836
lines changed

6 files changed

+2621
-836
lines changed

tutorials/analyzers.ipynb

Lines changed: 177 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,116 @@
66
"source": [
77
"# Analyzers Basic Tutorial\n",
88
"\n",
9+
"__Updated June 2024 to use a new dataset__\n",
10+
"\n",
911
"This Jupyter notebook will give a basic tutorial on how to use PyDeequ's Analyzers module."
1012
]
1113
},
1214
{
1315
"cell_type": "code",
1416
"execution_count": 1,
15-
"metadata": {},
17+
"metadata": {
18+
"tags": []
19+
},
1620
"outputs": [],
21+
"source": [
22+
"import os\n",
23+
"# indicate your Spark version, here we use Spark 3.5 with pydeequ 1.4.0\n",
24+
"os.environ[\"SPARK_VERSION\"] = '3.5'"
25+
]
26+
},
27+
{
28+
"cell_type": "code",
29+
"execution_count": 2,
30+
"metadata": {
31+
"tags": []
32+
},
33+
"outputs": [
34+
{
35+
"name": "stdout",
36+
"output_type": "stream",
37+
"text": [
38+
":: loading settings :: url = jar:file:/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml\n"
39+
]
40+
},
41+
{
42+
"name": "stderr",
43+
"output_type": "stream",
44+
"text": [
45+
"Ivy Default Cache set to: /home/ec2-user/.ivy2/cache\n",
46+
"The jars for the packages stored in: /home/ec2-user/.ivy2/jars\n",
47+
"com.amazon.deequ#deequ added as a dependency\n",
48+
":: resolving dependencies :: org.apache.spark#spark-submit-parent-23421fea-77b3-4d69-9251-54adf6371fd9;1.0\n",
49+
"\tconfs: [default]\n",
50+
"\tfound com.amazon.deequ#deequ;2.0.3-spark-3.3 in central\n",
51+
"\tfound org.scala-lang#scala-reflect;2.12.10 in central\n",
52+
"\tfound org.scalanlp#breeze_2.12;0.13.2 in central\n",
53+
"\tfound org.scalanlp#breeze-macros_2.12;0.13.2 in central\n",
54+
"\tfound com.github.fommil.netlib#core;1.1.2 in central\n",
55+
"\tfound net.sf.opencsv#opencsv;2.3 in central\n",
56+
"\tfound com.github.rwl#jtransforms;2.4.0 in central\n",
57+
"\tfound junit#junit;4.8.2 in central\n",
58+
"\tfound org.apache.commons#commons-math3;3.2 in central\n",
59+
"\tfound org.spire-math#spire_2.12;0.13.0 in central\n",
60+
"\tfound org.spire-math#spire-macros_2.12;0.13.0 in central\n",
61+
"\tfound org.typelevel#machinist_2.12;0.6.1 in central\n",
62+
"\tfound com.chuusai#shapeless_2.12;2.3.2 in central\n",
63+
"\tfound org.typelevel#macro-compat_2.12;1.1.1 in central\n",
64+
"\tfound org.slf4j#slf4j-api;1.7.5 in central\n",
65+
":: resolution report :: resolve 435ms :: artifacts dl 12ms\n",
66+
"\t:: modules in use:\n",
67+
"\tcom.amazon.deequ#deequ;2.0.3-spark-3.3 from central in [default]\n",
68+
"\tcom.chuusai#shapeless_2.12;2.3.2 from central in [default]\n",
69+
"\tcom.github.fommil.netlib#core;1.1.2 from central in [default]\n",
70+
"\tcom.github.rwl#jtransforms;2.4.0 from central in [default]\n",
71+
"\tjunit#junit;4.8.2 from central in [default]\n",
72+
"\tnet.sf.opencsv#opencsv;2.3 from central in [default]\n",
73+
"\torg.apache.commons#commons-math3;3.2 from central in [default]\n",
74+
"\torg.scala-lang#scala-reflect;2.12.10 from central in [default]\n",
75+
"\torg.scalanlp#breeze-macros_2.12;0.13.2 from central in [default]\n",
76+
"\torg.scalanlp#breeze_2.12;0.13.2 from central in [default]\n",
77+
"\torg.slf4j#slf4j-api;1.7.5 from central in [default]\n",
78+
"\torg.spire-math#spire-macros_2.12;0.13.0 from central in [default]\n",
79+
"\torg.spire-math#spire_2.12;0.13.0 from central in [default]\n",
80+
"\torg.typelevel#machinist_2.12;0.6.1 from central in [default]\n",
81+
"\torg.typelevel#macro-compat_2.12;1.1.1 from central in [default]\n",
82+
"\t:: evicted modules:\n",
83+
"\torg.scala-lang#scala-reflect;2.12.1 by [org.scala-lang#scala-reflect;2.12.10] in [default]\n",
84+
"\torg.scala-lang#scala-reflect;2.12.0 by [org.scala-lang#scala-reflect;2.12.10] in [default]\n",
85+
"\t---------------------------------------------------------------------\n",
86+
"\t| | modules || artifacts |\n",
87+
"\t| conf | number| search|dwnlded|evicted|| number|dwnlded|\n",
88+
"\t---------------------------------------------------------------------\n",
89+
"\t| default | 17 | 0 | 0 | 2 || 15 | 0 |\n",
90+
"\t---------------------------------------------------------------------\n",
91+
":: retrieving :: org.apache.spark#spark-submit-parent-23421fea-77b3-4d69-9251-54adf6371fd9\n",
92+
"\tconfs: [default]\n",
93+
"\t0 artifacts copied, 15 already retrieved (0kB/9ms)\n"
94+
]
95+
},
96+
{
97+
"name": "stdout",
98+
"output_type": "stream",
99+
"text": [
100+
"24/06/14 23:25:58 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n"
101+
]
102+
},
103+
{
104+
"name": "stderr",
105+
"output_type": "stream",
106+
"text": [
107+
"Setting default log level to \"WARN\".\n",
108+
"To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n"
109+
]
110+
},
111+
{
112+
"name": "stdout",
113+
"output_type": "stream",
114+
"text": [
115+
"24/06/14 23:25:59 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.\n"
116+
]
117+
}
118+
],
17119
"source": [
18120
"from pyspark.sql import SparkSession, Row, DataFrame\n",
19121
"import json\n",
@@ -36,14 +138,30 @@
36138
"cell_type": "markdown",
37139
"metadata": {},
38140
"source": [
39-
"### We will be using the Amazon Product Reviews dataset -- specifically the Electronics subset. "
141+
"### We will be using the synthetic reviews dataset for Electronics products"
40142
]
41143
},
42144
{
43145
"cell_type": "code",
44-
"execution_count": 2,
45-
"metadata": {},
146+
"execution_count": 3,
147+
"metadata": {
148+
"tags": []
149+
},
46150
"outputs": [
151+
{
152+
"name": "stdout",
153+
"output_type": "stream",
154+
"text": [
155+
"24/06/14 23:26:01 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties\n"
156+
]
157+
},
158+
{
159+
"name": "stderr",
160+
"output_type": "stream",
161+
"text": [
162+
" \r"
163+
]
164+
},
47165
{
48166
"name": "stdout",
49167
"output_type": "stream",
@@ -53,32 +171,46 @@
53171
" |-- customer_id: string (nullable = true)\n",
54172
" |-- review_id: string (nullable = true)\n",
55173
" |-- product_id: string (nullable = true)\n",
56-
" |-- product_parent: string (nullable = true)\n",
57174
" |-- product_title: string (nullable = true)\n",
58-
" |-- star_rating: integer (nullable = true)\n",
59-
" |-- helpful_votes: integer (nullable = true)\n",
60-
" |-- total_votes: integer (nullable = true)\n",
61-
" |-- vine: string (nullable = true)\n",
62-
" |-- verified_purchase: string (nullable = true)\n",
175+
" |-- star_rating: long (nullable = true)\n",
176+
" |-- helpful_votes: long (nullable = true)\n",
177+
" |-- total_votes: long (nullable = true)\n",
178+
" |-- insight: string (nullable = true)\n",
63179
" |-- review_headline: string (nullable = true)\n",
64180
" |-- review_body: string (nullable = true)\n",
65-
" |-- review_date: date (nullable = true)\n",
66-
" |-- year: integer (nullable = true)\n",
181+
" |-- review_date: timestamp (nullable = true)\n",
182+
" |-- review_year: long (nullable = true)\n",
67183
"\n"
68184
]
69185
}
70186
],
71187
"source": [
72-
"df = spark.read.parquet(\"s3a://amazon-reviews-pds/parquet/product_category=Electronics/\")\n",
188+
"df = spark.read.parquet(\"s3a://aws-bigdata-blog/generated_synthetic_reviews/data/product_category=Electronics/\")\n",
73189
"\n",
74190
"df.printSchema()"
75191
]
76192
},
77193
{
78194
"cell_type": "code",
79-
"execution_count": 3,
80-
"metadata": {},
195+
"execution_count": 4,
196+
"metadata": {
197+
"tags": []
198+
},
81199
"outputs": [
200+
{
201+
"name": "stdout",
202+
"output_type": "stream",
203+
"text": [
204+
"24/06/14 23:26:06 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.\n"
205+
]
206+
},
207+
{
208+
"name": "stderr",
209+
"output_type": "stream",
210+
"text": [
211+
" \r"
212+
]
213+
},
82214
{
83215
"name": "stdout",
84216
"output_type": "stream",
@@ -87,15 +219,23 @@
87219
"| entity| instance| name| value|\n",
88220
"+-----------+--------------------+-------------------+--------------------+\n",
89221
"| Column| review_id| Completeness| 1.0|\n",
90-
"| Column| review_id|ApproxCountDistinct| 3010972.0|\n",
91-
"|Mutlicolumn|total_votes,star_...| Correlation|-0.03451097996538765|\n",
92-
"| Dataset| *| Size| 3120938.0|\n",
93-
"| Column| star_rating| Mean| 4.036143941340712|\n",
94-
"| Column| top star_rating| Compliance| 0.7494070692849394|\n",
95-
"|Mutlicolumn|total_votes,helpf...| Correlation| 0.9936463809903863|\n",
222+
"| Column| review_id|ApproxCountDistinct| 3160409.0|\n",
223+
"|Mutlicolumn|total_votes,star_...| Correlation|-7.38808965018615...|\n",
224+
"| Dataset| *| Size| 3010972.0|\n",
225+
"| Column| star_rating| Mean| 3.9999973430506826|\n",
226+
"| Column| top star_rating| Compliance| 0.7499993357626706|\n",
227+
"|Mutlicolumn|total_votes,helpf...| Correlation| 0.9817922803462663|\n",
96228
"+-----------+--------------------+-------------------+--------------------+\n",
97229
"\n"
98230
]
231+
},
232+
{
233+
"name": "stderr",
234+
"output_type": "stream",
235+
"text": [
236+
"/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/pyspark/sql/dataframe.py:127: UserWarning: DataFrame constructor is internal. Do not directly use it.\n",
237+
" warnings.warn(\"DataFrame constructor is internal. Do not directly use it.\")\n"
238+
]
99239
}
100240
],
101241
"source": [
@@ -119,7 +259,9 @@
119259
{
120260
"cell_type": "code",
121261
"execution_count": 5,
122-
"metadata": {},
262+
"metadata": {
263+
"tags": []
264+
},
123265
"outputs": [
124266
{
125267
"data": {
@@ -161,42 +303,42 @@
161303
" <td>Column</td>\n",
162304
" <td>review_id</td>\n",
163305
" <td>ApproxCountDistinct</td>\n",
164-
" <td>3.010972e+06</td>\n",
306+
" <td>3.160409e+06</td>\n",
165307
" </tr>\n",
166308
" <tr>\n",
167309
" <th>2</th>\n",
168310
" <td>Mutlicolumn</td>\n",
169311
" <td>total_votes,star_rating</td>\n",
170312
" <td>Correlation</td>\n",
171-
" <td>-3.451098e-02</td>\n",
313+
" <td>-7.388090e-04</td>\n",
172314
" </tr>\n",
173315
" <tr>\n",
174316
" <th>3</th>\n",
175317
" <td>Dataset</td>\n",
176318
" <td>*</td>\n",
177319
" <td>Size</td>\n",
178-
" <td>3.120938e+06</td>\n",
320+
" <td>3.010972e+06</td>\n",
179321
" </tr>\n",
180322
" <tr>\n",
181323
" <th>4</th>\n",
182324
" <td>Column</td>\n",
183325
" <td>star_rating</td>\n",
184326
" <td>Mean</td>\n",
185-
" <td>4.036144e+00</td>\n",
327+
" <td>3.999997e+00</td>\n",
186328
" </tr>\n",
187329
" <tr>\n",
188330
" <th>5</th>\n",
189331
" <td>Column</td>\n",
190332
" <td>top star_rating</td>\n",
191333
" <td>Compliance</td>\n",
192-
" <td>7.494071e-01</td>\n",
334+
" <td>7.499993e-01</td>\n",
193335
" </tr>\n",
194336
" <tr>\n",
195337
" <th>6</th>\n",
196338
" <td>Mutlicolumn</td>\n",
197339
" <td>total_votes,helpful_votes</td>\n",
198340
" <td>Correlation</td>\n",
199-
" <td>9.936464e-01</td>\n",
341+
" <td>9.817923e-01</td>\n",
200342
" </tr>\n",
201343
" </tbody>\n",
202344
"</table>\n",
@@ -205,12 +347,12 @@
205347
"text/plain": [
206348
" entity instance name value\n",
207349
"0 Column review_id Completeness 1.000000e+00\n",
208-
"1 Column review_id ApproxCountDistinct 3.010972e+06\n",
209-
"2 Mutlicolumn total_votes,star_rating Correlation -3.451098e-02\n",
210-
"3 Dataset * Size 3.120938e+06\n",
211-
"4 Column star_rating Mean 4.036144e+00\n",
212-
"5 Column top star_rating Compliance 7.494071e-01\n",
213-
"6 Mutlicolumn total_votes,helpful_votes Correlation 9.936464e-01"
350+
"1 Column review_id ApproxCountDistinct 3.160409e+06\n",
351+
"2 Mutlicolumn total_votes,star_rating Correlation -7.388090e-04\n",
352+
"3 Dataset * Size 3.010972e+06\n",
353+
"4 Column star_rating Mean 3.999997e+00\n",
354+
"5 Column top star_rating Compliance 7.499993e-01\n",
355+
"6 Mutlicolumn total_votes,helpful_votes Correlation 9.817923e-01"
214356
]
215357
},
216358
"execution_count": 5,
@@ -247,7 +389,7 @@
247389
"name": "python",
248390
"nbconvert_exporter": "python",
249391
"pygments_lexer": "ipython3",
250-
"version": "3.6.10"
392+
"version": "3.10.14"
251393
}
252394
},
253395
"nbformat": 4,

0 commit comments

Comments (0)