|
52 | 52 | }, |
53 | 53 | { |
54 | 54 | "cell_type": "code", |
55 | | - "execution_count": 7, |
| 55 | + "execution_count": null, |
56 | 56 | "id": "ac3e8958-5d9c-4e80-9a6f-fd343a3d4dd5", |
57 | 57 | "metadata": { |
58 | 58 | "tags": [] |
|
89 | 89 | "id": "2ba80b72-4efc-4369-9acc-525613671e7b", |
90 | 90 | "metadata": {}, |
91 | 91 | "source": [ |
92 | | - "On Avocado dataset (how original). If you cloned git repo, is in /data, else go Kaggle" |
| 92 | + "Predict average price, avocado dataset (how original). If you ggit cloned repo, is in /data, else go Kaggle" |
93 | 93 | ] |
94 | 94 | }, |
95 | 95 | { |
|
108 | 108 | }, |
109 | 109 | "source": [ |
110 | 110 | "*Quick desc / scope of dataset :* \n", |
111 | | - "No EDA, this exercise have been made a million times\n", |
| 111 | + "No EDA, this exercise have been made a million times \n", |
112 | 112 | "Years 2015 to 2018 \n", |
113 | 113 | "Two avocado types : organic or conventional \n", |
114 | 114 | "Region = region of consumption \n", |
|
117 | 117 | }, |
118 | 118 | { |
119 | 119 | "cell_type": "code", |
120 | | - "execution_count": 8, |
| 120 | + "execution_count": null, |
121 | 121 | "id": "888a85f7-5e40-4e90-8a35-3cb1435d1460", |
122 | 122 | "metadata": { |
123 | 123 | "tags": [] |
124 | 124 | }, |
125 | | - "outputs": [ |
126 | | - { |
127 | | - "name": "stdout", |
128 | | - "output_type": "stream", |
129 | | - "text": [ |
130 | | - "root\n", |
131 | | - " |-- _c0: integer (nullable = true)\n", |
132 | | - " |-- Date: timestamp (nullable = true)\n", |
133 | | - " |-- AveragePrice: double (nullable = true)\n", |
134 | | - " |-- Total Volume: double (nullable = true)\n", |
135 | | - " |-- 4046: double (nullable = true)\n", |
136 | | - " |-- 4225: double (nullable = true)\n", |
137 | | - " |-- 4770: double (nullable = true)\n", |
138 | | - " |-- Total Bags: double (nullable = true)\n", |
139 | | - " |-- Small Bags: double (nullable = true)\n", |
140 | | - " |-- Large Bags: double (nullable = true)\n", |
141 | | - " |-- XLarge Bags: double (nullable = true)\n", |
142 | | - " |-- type: string (nullable = true)\n", |
143 | | - " |-- year: integer (nullable = true)\n", |
144 | | - " |-- region: string (nullable = true)\n", |
145 | | - "\n", |
146 | | - "+---+-------------------+------------+------------+-------+---------+-----+----------+----------+----------+-----------+------------+----+------+\n", |
147 | | - "|_c0| Date|AveragePrice|Total Volume| 4046| 4225| 4770|Total Bags|Small Bags|Large Bags|XLarge Bags| type|year|region|\n", |
148 | | - "+---+-------------------+------------+------------+-------+---------+-----+----------+----------+----------+-----------+------------+----+------+\n", |
149 | | - "| 0|2015-12-27 00:00:00| 1.33| 64236.62|1036.74| 54454.85|48.16| 8696.87| 8603.62| 93.25| 0.0|conventional|2015|Albany|\n", |
150 | | - "| 1|2015-12-20 00:00:00| 1.35| 54876.98| 674.28| 44638.81|58.33| 9505.56| 9408.07| 97.49| 0.0|conventional|2015|Albany|\n", |
151 | | - "| 2|2015-12-13 00:00:00| 0.93| 118220.22| 794.7|109149.67|130.5| 8145.35| 8042.21| 103.14| 0.0|conventional|2015|Albany|\n", |
152 | | - "| 3|2015-12-06 00:00:00| 1.08| 78992.15| 1132.0| 71976.41|72.58| 5811.16| 5677.4| 133.76| 0.0|conventional|2015|Albany|\n", |
153 | | - "+---+-------------------+------------+------------+-------+---------+-----+----------+----------+----------+-----------+------------+----+------+\n", |
154 | | - "only showing top 4 rows\n", |
155 | | - "\n" |
156 | | - ] |
157 | | - } |
158 | | - ], |
| 125 | + "outputs": [], |
159 | 126 | "source": [ |
160 | 127 | "# Cache table/dataframe for re-usable table with .cache()\n", |
161 | 128 | "# caching operation takes place only when a Spark action (count, show, take or write) is also performed on the same dataframe\n", |
|
181 | 148 | }, |
182 | 149 | { |
183 | 150 | "cell_type": "code", |
184 | | - "execution_count": 9, |
| 151 | + "execution_count": null, |
185 | 152 | "id": "e0068bc2-270c-4e43-beeb-082b404ce297", |
186 | 153 | "metadata": { |
187 | 154 | "tags": [] |
|
201 | 168 | "id": "b840a5b1-8bd7-4c73-a8c9-133e4983e8dd", |
202 | 169 | "metadata": {}, |
203 | 170 | "source": [ |
204 | | - "- Steps differs a bit from sklearn. Search for 'transformers' and 'estimators'\n", |
| 171 | + "- Steps differs a bit from sklearn. Search for Spark 'transformers' and 'estimators'\n", |
205 | 172 | "- No EDA, has been done a million times on this dataset. \n", |
206 | 173 | "- Format data \n", |
207 | | - "-Feature creation from 'Date' : yy and mm \n", |
208 | | - "-Drop columns : Total Bags, Total Volume (strong corr with respective subcategories) ; could also be done in pipeline tho ?\n", |
209 | | - "- Pipeline (encode etc...) \n", |
210 | | - "-One hot encoding categorical 'region' (before that, use StringIndexer) \n", |
211 | | - "-Drop transformed columns: Date, region. Note : unlike scikit-learn col transf, pyspark adds new col when transforming \n", |
212 | | - "- Consolidate all remaining features in a single vector using VectorAssembler\n", |
213 | | - "- Scale numerical features using StandardScaler <- would be earlier in a sklearn pipeline\n", |
214 | | - "- Predict" |
| 174 | + "-Feature creation from 'Date' & 'Year' : yy and mm \n", |
| 175 | + "-Optional : Drop columns : Total Bags, Total Volume (strong corr with respective subcategories) \n", |
| 176 | + "- Build Pipeline (encode etc...) \n", |
| 177 | + "-StringIndexer to convert categorical in caetgory indices \n", |
| 178 | + "-One hot encoding categorical 'region' \n", |
| 179 | + "-VectorAssembler, used encoded features into a single vector \n", |
| 180 | + "-StandardScaler on features vector <- would be earlier in sklearn pipeline \n", |
| 181 | + "-define regressor (here, randomForest) \n", |
| 182 | + "-build Pipeline()\n", |
| 183 | + "- Simple model, no cv/search param" |
215 | 184 | ] |
216 | 185 | }, |
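A hedged sketch of the pipeline the steps above describe, assuming Spark 3.x; the column names (`type_idx`, `region_ohe`, the numeric list, etc.) are illustrative placeholders, not necessarily the notebook's actual identifiers:

```python
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler
from pyspark.ml.regression import RandomForestRegressor

# StringIndexer turns categorical columns into category indices
indexer = StringIndexer(inputCols=["type", "region"], outputCols=["type_idx", "region_idx"])
# one-hot encode the indexed categories
encoder = OneHotEncoder(inputCols=["type_idx", "region_idx"], outputCols=["type_ohe", "region_ohe"])

# numeric columns assumed from the schema shown earlier
numeric = ["Small Bags", "Large Bags", "XLarge Bags", "Year Index", "Month"]
assembler = VectorAssembler(inputCols=numeric + ["type_ohe", "region_ohe"], outputCol="features_raw")

# scaling applies to the assembled vector, later than it would in sklearn
scaler = StandardScaler(inputCol="features_raw", outputCol="features")

rf = RandomForestRegressor(featuresCol="features", labelCol="AveragePrice")
pipeline = Pipeline(stages=[indexer, encoder, assembler, scaler, rf])
```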
217 | 186 | { |
|
226 | 195 | }, |
227 | 196 | { |
228 | 197 | "cell_type": "code", |
229 | | - "execution_count": 10, |
| 198 | + "execution_count": null, |
230 | 199 | "id": "ea5b4865-062b-491a-bf10-1242d46d358c", |
231 | 200 | "metadata": {}, |
232 | | - "outputs": [ |
233 | | - { |
234 | | - "name": "stdout", |
235 | | - "output_type": "stream", |
236 | | - "text": [ |
237 | | - "+------------+-----------+----------+-----------+----------+----------+-----------+------------+----+------+----------+-----+\n", |
238 | | - "|AveragePrice|Medium Size|Large Size|XLarge Size|Small Bags|Large Bags|XLarge Bags| type|year|region|Year Index|Month|\n", |
239 | | - "+------------+-----------+----------+-----------+----------+----------+-----------+------------+----+------+----------+-----+\n", |
240 | | - "| 1.33| 1036.74| 54454.85| 48.16| 8603.62| 93.25| 0.0|conventional|2015|Albany| 15| 12|\n", |
241 | | - "| 1.35| 674.28| 44638.81| 58.33| 9408.07| 97.49| 0.0|conventional|2015|Albany| 15| 12|\n", |
242 | | - "| 0.93| 794.7| 109149.67| 130.5| 8042.21| 103.14| 0.0|conventional|2015|Albany| 15| 12|\n", |
243 | | - "| 1.08| 1132.0| 71976.41| 72.58| 5677.4| 133.76| 0.0|conventional|2015|Albany| 15| 12|\n", |
244 | | - "+------------+-----------+----------+-----------+----------+----------+-----------+------------+----+------+----------+-----+\n", |
245 | | - "only showing top 4 rows\n", |
246 | | - "\n" |
247 | | - ] |
248 | | - } |
249 | | - ], |
| 201 | + "outputs": [], |
250 | 202 | "source": [ |
251 | 203 | "# convert 'year' yyyy to yy (yyyy - 2000, since we have 2015-2018 values)\n", |
252 | 204 | "df = df.withColumn('Year Index', col('Year') - 2000)\n", |
|
276 | 228 | }, |
277 | 229 | { |
278 | 230 | "cell_type": "code", |
279 | | - "execution_count": 14, |
| 231 | + "execution_count": null, |
280 | 232 | "id": "382272ea-07aa-43a4-af0f-681b332af34d", |
281 | 233 | "metadata": {}, |
282 | 234 | "outputs": [], |
|
330 | 282 | "id": "c3332499-66a1-4f79-be00-bcefcbda212a", |
331 | 283 | "metadata": {}, |
332 | 284 | "source": [ |
333 | | - "Crude attempt, no cv, some default rf parameters. \n", |
| 285 | + "Crude attempt, no cv, some arbitrary randomForest parameters. \n", |
334 | 286 | "For parameters tuning, look up for pyspark.ml.tuning / CrossValidator, ParamGridBuilder. Not used here" |
335 | 287 | ] |
336 | 288 | }, |
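Since CrossValidator / ParamGridBuilder are pointed at but not used, a minimal sketch of what the tuning could look like, assuming the `pipeline`, `rf`, and `train` names from the surrounding cells; the grid values are arbitrary:

```python
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

evaluator = RegressionEvaluator(predictionCol='prediction',
                                labelCol='AveragePrice', metricName='rmse')

# small, arbitrary grid over two RandomForest hyperparameters
grid = (ParamGridBuilder()
        .addGrid(rf.numTrees, [20, 50])
        .addGrid(rf.maxDepth, [5, 10])
        .build())

cv = CrossValidator(estimator=pipeline, estimatorParamMaps=grid,
                    evaluator=evaluator, numFolds=3)
cv_model = cv.fit(train)  # best model picked by rmse across the folds
```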
337 | 289 | { |
338 | 290 | "cell_type": "code", |
339 | | - "execution_count": 18, |
| 291 | + "execution_count": null, |
340 | 292 | "id": "ae2ebec7-8379-45bd-b375-faac5c64824c", |
341 | 293 | "metadata": { |
342 | 294 | "tags": [] |
343 | 295 | }, |
344 | | - "outputs": [ |
345 | | - { |
346 | | - "data": { |
347 | | - "text/plain": [ |
348 | | - "0.1975694758480664" |
349 | | - ] |
350 | | - }, |
351 | | - "execution_count": 18, |
352 | | - "metadata": {}, |
353 | | - "output_type": "execute_result" |
354 | | - } |
355 | | - ], |
| 296 | + "outputs": [], |
356 | 297 | "source": [ |
357 | 298 | "from pyspark.ml.evaluation import RegressionEvaluator\n", |
358 | 299 | "\n", |
|
365 | 306 | "\n", |
366 | 307 | "# apply the model to the test set\n", |
367 | 308 | "prediction = model.transform(test)\n", |
368 | | - "eval = RegressionEvaluator(predictionCol='prediction',\n", |
| 309 | + "eval_ = RegressionEvaluator(predictionCol='prediction',\n", |
369 | 310 | " labelCol='AveragePrice', metricName='rmse')\n", |
370 | 311 | "\n", |
371 | | - "eval.evaluate(prediction)" |
| 312 | + "eval_.evaluate(prediction)" |
372 | 313 | ] |
373 | 314 | }, |
374 | 315 | { |
375 | 316 | "cell_type": "markdown", |
376 | 317 | "id": "5a769698-04bc-4eda-9edc-63a4bfd11d25", |
377 | 318 | "metadata": {}, |
378 | 319 | "source": [ |
379 | | - "For reference, original article, using Linear regression + cv : rmse of .28" |
| 320 | + "For reference, original article, using Linear regression + cv/gridSearch : rmse of .28" |
380 | 321 | ] |
381 | 322 | } |
382 | 323 | ], |
|