Skip to content

Commit 3888429

Browse files
authored
Merge pull request #57 from JakeRadMSFT/u/jakerad/download-data
Update all Notebooks to download data when it's needed.
2 parents ab5aa15 + 4e7b425 commit 3888429

13 files changed

+845
-801
lines changed

machine-learning/02-Data Preparation and Feature Engineering.ipynb

Lines changed: 54 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,57 @@
8686
],
8787
"outputs": []
8888
},
89+
{
90+
"cell_type": "markdown",
91+
"metadata": {},
92+
"source": [
93+
"## Download or Locate Data\n",
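94+
"The following code looks for the data file first in the `data` folder under the current directory and then in the current directory itself; if it is found in neither, the file is downloaded from the dotnet/csharp-notebooks GitHub repository into the current directory."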
95+
]
96+
},
97+
{
98+
"cell_type": "code",
99+
"execution_count": 1,
100+
"metadata": {
101+
"dotnet_interactive": {
102+
"language": "csharp"
103+
}
104+
},
105+
"source": [
106+
"using System;\n",
107+
"using System.IO;\n",
108+
"using System.Net;\n",
109+
"\n",
110+
"string EnsureDataSetDownloaded(string fileName)\n",
111+
"{\n",
112+
"\n",
113+
"\t// This is the path if the repo has been checked out.\n",
114+
"\tvar filePath = Path.Combine(Directory.GetCurrentDirectory(),\"data\", fileName);\n",
115+
"\n",
116+
"\tif (!File.Exists(filePath))\n",
117+
"\t{\n",
118+
"\t\t// This is the path if the file has already been downloaded.\n",
119+
"\t\tfilePath = Path.Combine(Directory.GetCurrentDirectory(), fileName);\n",
120+
"\t}\n",
121+
"\n",
122+
"\tif (!File.Exists(filePath))\n",
123+
"\t{\n",
124+
"\t\tusing (var client = new WebClient())\n",
125+
"\t\t{\n",
126+
"\t\t\tclient.DownloadFile($\"https://raw.githubusercontent.com/dotnet/csharp-notebooks/main/machine-learning/data/{fileName}\", filePath);\n",
127+
"\t\t}\n",
128+
"\t\tConsole.WriteLine($\"Downloaded {fileName} to : {filePath}\");\n",
129+
"\t}\n",
130+
"\telse\n",
131+
"\t{\n",
132+
"\t\tConsole.WriteLine($\"{fileName} found here: {filePath}\");\n",
133+
"\t}\n",
134+
"\n",
135+
"\treturn filePath;\n",
136+
"}"
137+
],
138+
"outputs": []
139+
},
89140
{
90141
"cell_type": "markdown",
91142
"metadata": {},
@@ -188,12 +239,14 @@
188239
}
189240
},
190241
"source": [
242+
"var trainDataPath = EnsureDataSetDownloaded(\"taxi-fare.csv\");\n",
243+
"\n",
191244
"// Create TextLoader based on the Model Input type. \n",
192245
"TextLoader textLoader = mlContext.Data.CreateTextLoader<ModelInput>(separatorChar: ',', hasHeader: true);\n",
193246
"\n",
194247
"// Load the data into an IDataView. Load() method can support multiple files. \n",
195248
"// Files must have the same separator character, header, column names, etc. \n",
196-
"IDataView data = textLoader.Load(\"data/taxi-fare.csv\");\n",
249+
"IDataView data = textLoader.Load(trainDataPath);\n",
197250
"\n",
198251
"data.Preview(1); "
199252
],

machine-learning/03-Training and AutoML.ipynb

Lines changed: 117 additions & 150 deletions
Large diffs are not rendered by default.

machine-learning/04-Model Evaluation.ipynb

Lines changed: 45 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -138,9 +138,8 @@
138138
"cell_type": "markdown",
139139
"metadata": {},
140140
"source": [
141-
"### Load your data\n",
142-
"\n",
143-
"Use the `#!value` and `#!share` magic commands to fetch the data from GitHub, store it in the `taxi_data` variable and load it into a `DataFrame` "
141+
"### Download or Locate Data\n",
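142+
"The following code looks for the data file first in the `data` folder under the current directory and then in the current directory itself; if it is found in neither, the file is downloaded from the dotnet/csharp-notebooks GitHub repository into the current directory."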
144143
]
145144
},
146145
{
@@ -152,22 +151,48 @@
152151
}
153152
},
154153
"source": [
155-
"#!value --name taxi_data --from-url https://github.com/dotnet/csharp-notebooks/raw/main/machine-learning/data/taxi-fare.csv"
154+
"using System;\n",
155+
"using System.IO;\n",
156+
"using System.Net;\n",
157+
"\n",
158+
"string EnsureDataSetDownloaded(string fileName)\n",
159+
"{\n",
160+
"\n",
161+
"\t// This is the path if the repo has been checked out.\n",
162+
"\tvar filePath = Path.Combine(Directory.GetCurrentDirectory(),\"data\", fileName);\n",
163+
"\n",
164+
"\tif (!File.Exists(filePath))\n",
165+
"\t{\n",
166+
"\t\t// This is the path if the file has already been downloaded.\n",
167+
"\t\tfilePath = Path.Combine(Directory.GetCurrentDirectory(), fileName);\n",
168+
"\t}\n",
169+
"\n",
170+
"\tif (!File.Exists(filePath))\n",
171+
"\t{\n",
172+
"\t\tusing (var client = new WebClient())\n",
173+
"\t\t{\n",
174+
"\t\t\tclient.DownloadFile($\"https://raw.githubusercontent.com/dotnet/csharp-notebooks/main/machine-learning/data/{fileName}\", filePath);\n",
175+
"\t\t}\n",
176+
"\t\tConsole.WriteLine($\"Downloaded {fileName} to : {filePath}\");\n",
177+
"\t}\n",
178+
"\telse\n",
179+
"\t{\n",
180+
"\t\tConsole.WriteLine($\"{fileName} found here: {filePath}\");\n",
181+
"\t}\n",
182+
"\n",
183+
"\treturn filePath;\n",
184+
"}"
156185
],
157-
"outputs": []
158-
},
159-
{
160-
"cell_type": "code",
161-
"execution_count": 1,
162-
"metadata": {
163-
"dotnet_interactive": {
164-
"language": "csharp"
186+
"outputs": [
187+
{
188+
"output_type": "execute_result",
189+
"data": {
190+
"text/plain": "Train Data Path: C:\\dev\\csharp-notebooks\\machine-learning\\data\\taxi-fare.csv\r\n"
191+
},
192+
"execution_count": 1,
193+
"metadata": {}
165194
}
166-
},
167-
"source": [
168-
"#!share taxi_data --from value"
169-
],
170-
"outputs": []
195+
]
171196
},
172197
{
173198
"cell_type": "code",
@@ -178,7 +203,8 @@
178203
}
179204
},
180205
"source": [
181-
"var df = DataFrame.LoadCsvFromString(taxi_data);"
206+
"var trainDataPath = EnsureDataSetDownloaded(\"taxi-fare.csv\");\n",
207+
"var df = DataFrame.LoadCsv(trainDataPath);"
182208
],
183209
"outputs": []
184210
},
@@ -204,7 +230,7 @@
204230
{
205231
"output_type": "execute_result",
206232
"data": {
207-
"text/html": "<table id=\"table_637928803371228110\"><thead><tr><th><i>index</i></th><th>vendor_id</th><th>rate_code</th><th>passenger_count</th><th>trip_time_in_secs</th><th>trip_distance</th><th>payment_type</th><th>fare_amount</th></tr></thead><tbody><tr><td><i><div class=\"dni-plaintext\">0</div></i></td><td>CMT</td><td><div class=\"dni-plaintext\">1</div></td><td><div class=\"dni-plaintext\">1</div></td><td><div class=\"dni-plaintext\">1271</div></td><td><div class=\"dni-plaintext\">3.8</div></td><td>CRD</td><td><div class=\"dni-plaintext\">17.5</div></td></tr><tr><td><i><div class=\"dni-plaintext\">1</div></i></td><td>CMT</td><td><div class=\"dni-plaintext\">1</div></td><td><div class=\"dni-plaintext\">1</div></td><td><div class=\"dni-plaintext\">474</div></td><td><div class=\"dni-plaintext\">1.5</div></td><td>CRD</td><td><div class=\"dni-plaintext\">8</div></td></tr><tr><td><i><div class=\"dni-plaintext\">2</div></i></td><td>CMT</td><td><div class=\"dni-plaintext\">1</div></td><td><div class=\"dni-plaintext\">1</div></td><td><div class=\"dni-plaintext\">637</div></td><td><div class=\"dni-plaintext\">1.4</div></td><td>CRD</td><td><div class=\"dni-plaintext\">8.5</div></td></tr><tr><td><i><div class=\"dni-plaintext\">3</div></i></td><td>CMT</td><td><div class=\"dni-plaintext\">1</div></td><td><div class=\"dni-plaintext\">1</div></td><td><div class=\"dni-plaintext\">181</div></td><td><div class=\"dni-plaintext\">0.6</div></td><td>CSH</td><td><div class=\"dni-plaintext\">4.5</div></td></tr><tr><td><i><div class=\"dni-plaintext\">4</div></i></td><td>CMT</td><td><div class=\"dni-plaintext\">1</div></td><td><div class=\"dni-plaintext\">1</div></td><td><div class=\"dni-plaintext\">661</div></td><td><div class=\"dni-plaintext\">1.1</div></td><td>CRD</td><td><div class=\"dni-plaintext\">8.5</div></td></tr></tbody></table>"
233+
"text/html": "<table id=\"table_637934937843853168\"><thead><tr><th><i>index</i></th><th>vendor_id</th><th>rate_code</th><th>passenger_count</th><th>trip_time_in_secs</th><th>trip_distance</th><th>payment_type</th><th>fare_amount</th></tr></thead><tbody><tr><td><i><div class=\"dni-plaintext\">0</div></i></td><td>CMT</td><td><div class=\"dni-plaintext\">1</div></td><td><div class=\"dni-plaintext\">1</div></td><td><div class=\"dni-plaintext\">1271</div></td><td><div class=\"dni-plaintext\">3.8</div></td><td>CRD</td><td><div class=\"dni-plaintext\">17.5</div></td></tr><tr><td><i><div class=\"dni-plaintext\">1</div></i></td><td>CMT</td><td><div class=\"dni-plaintext\">1</div></td><td><div class=\"dni-plaintext\">1</div></td><td><div class=\"dni-plaintext\">474</div></td><td><div class=\"dni-plaintext\">1.5</div></td><td>CRD</td><td><div class=\"dni-plaintext\">8</div></td></tr><tr><td><i><div class=\"dni-plaintext\">2</div></i></td><td>CMT</td><td><div class=\"dni-plaintext\">1</div></td><td><div class=\"dni-plaintext\">1</div></td><td><div class=\"dni-plaintext\">637</div></td><td><div class=\"dni-plaintext\">1.4</div></td><td>CRD</td><td><div class=\"dni-plaintext\">8.5</div></td></tr><tr><td><i><div class=\"dni-plaintext\">3</div></i></td><td>CMT</td><td><div class=\"dni-plaintext\">1</div></td><td><div class=\"dni-plaintext\">1</div></td><td><div class=\"dni-plaintext\">181</div></td><td><div class=\"dni-plaintext\">0.6</div></td><td>CSH</td><td><div class=\"dni-plaintext\">4.5</div></td></tr><tr><td><i><div class=\"dni-plaintext\">4</div></i></td><td>CMT</td><td><div class=\"dni-plaintext\">1</div></td><td><div class=\"dni-plaintext\">1</div></td><td><div class=\"dni-plaintext\">661</div></td><td><div class=\"dni-plaintext\">1.1</div></td><td>CRD</td><td><div class=\"dni-plaintext\">8.5</div></td></tr></tbody></table>"
208234
},
209235
"execution_count": 1,
210236
"metadata": {}

machine-learning/E2E-Classification with Iris Dataset.ipynb

Lines changed: 63 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@
6060
{
6161
"output_type": "execute_result",
6262
"data": {
63-
"text/markdown": "Loading extensions from `Microsoft.Data.Analysis.Interactive.dll`"
63+
"text/markdown": "Loading extensions from `Microsoft.ML.AutoML.Interactive.dll`"
6464
},
6565
"execution_count": 1,
6666
"metadata": {}
@@ -76,7 +76,7 @@
7676
{
7777
"output_type": "execute_result",
7878
"data": {
79-
"text/markdown": "Loading extensions from `Microsoft.ML.AutoML.Interactive.dll`"
79+
"text/markdown": "Loading extensions from `Microsoft.Data.Analysis.Interactive.dll`"
8080
},
8181
"execution_count": 1,
8282
"metadata": {}
@@ -157,6 +157,57 @@
157157
],
158158
"outputs": []
159159
},
160+
{
161+
"cell_type": "markdown",
162+
"metadata": {},
163+
"source": [
164+
"## Download or Locate Data\n",
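165+
"The following code looks for the data file first in the `data` folder under the current directory and then in the current directory itself; if it is found in neither, the file is downloaded from the dotnet/csharp-notebooks GitHub repository into the current directory."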
166+
]
167+
},
168+
{
169+
"cell_type": "code",
170+
"execution_count": 1,
171+
"metadata": {
172+
"dotnet_interactive": {
173+
"language": "csharp"
174+
}
175+
},
176+
"source": [
177+
"using System;\n",
178+
"using System.IO;\n",
179+
"using System.Net;\n",
180+
"\n",
181+
"string EnsureDataSetDownloaded(string fileName)\n",
182+
"{\n",
183+
"\n",
184+
"\t// This is the path if the repo has been checked out.\n",
185+
"\tvar filePath = Path.Combine(Directory.GetCurrentDirectory(),\"data\", fileName);\n",
186+
"\n",
187+
"\tif (!File.Exists(filePath))\n",
188+
"\t{\n",
189+
"\t\t// This is the path if the file has already been downloaded.\n",
190+
"\t\tfilePath = Path.Combine(Directory.GetCurrentDirectory(), fileName);\n",
191+
"\t}\n",
192+
"\n",
193+
"\tif (!File.Exists(filePath))\n",
194+
"\t{\n",
195+
"\t\tusing (var client = new WebClient())\n",
196+
"\t\t{\n",
197+
"\t\t\tclient.DownloadFile($\"https://raw.githubusercontent.com/dotnet/csharp-notebooks/main/machine-learning/data/{fileName}\", filePath);\n",
198+
"\t\t}\n",
199+
"\t\tConsole.WriteLine($\"Downloaded {fileName} to : {filePath}\");\n",
200+
"\t}\n",
201+
"\telse\n",
202+
"\t{\n",
203+
"\t\tConsole.WriteLine($\"{fileName} found here: {filePath}\");\n",
204+
"\t}\n",
205+
"\n",
206+
"\treturn filePath;\n",
207+
"}"
208+
],
209+
"outputs": []
210+
},
160211
{
161212
"cell_type": "markdown",
162213
"metadata": {},
@@ -177,8 +228,8 @@
177228
"var mlContext = new MLContext();\n",
178229
"\n",
179230
"// Define path to training data\n",
180-
"string trainValidateDataPath = @\".\\data\\iris-train.tsv\";\n",
181-
"string testDataPath = @\".\\data\\iris-test.tsv\";\n",
231+
"string trainValidateDataPath = EnsureDataSetDownloaded(\"iris-train.tsv\");\n",
232+
"string testDataPath = EnsureDataSetDownloaded(\"iris-test.tsv\");\n",
182233
"\n",
183234
"// Load data from a text file to an IDataView (a flexible, efficient way of describing tabular data)\n",
184235
"IDataView trainValidateData = mlContext.Data.LoadFromTextFile<ModelInput>(\n",
@@ -200,6 +251,14 @@
200251
""
201252
],
202253
"outputs": [
254+
{
255+
"output_type": "execute_result",
256+
"data": {
257+
"text/plain": "iris-test.tsv found here: C:\\dev\\csharp-notebooks\\machine-learning\\data\\iris-test.tsv\r\n"
258+
},
259+
"execution_count": 1,
260+
"metadata": {}
261+
},
203262
{
204263
"output_type": "execute_result",
205264
"data": {

machine-learning/E2E-Forecasting using Regression with Luna Dataset.ipynb

Lines changed: 65 additions & 5 deletions
Large diffs are not rendered by default.

machine-learning/E2E-Forecasting using SSA with Luna Dataset.ipynb

Lines changed: 52 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,57 @@
116116
"In the code block below, we show how to load dataset into `DataFrame`."
117117
]
118118
},
119+
{
120+
"cell_type": "markdown",
121+
"metadata": {},
122+
"source": [
123+
"#### Download or Locate Data\n",
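124+
"The following code looks for the data file first in the `data` folder under the current directory and then in the current directory itself; if it is found in neither, the file is downloaded from the dotnet/csharp-notebooks GitHub repository into the current directory."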
125+
]
126+
},
127+
{
128+
"cell_type": "code",
129+
"execution_count": 1,
130+
"metadata": {
131+
"dotnet_interactive": {
132+
"language": "csharp"
133+
}
134+
},
135+
"source": [
136+
"using System;\n",
137+
"using System.IO;\n",
138+
"using System.Net;\n",
139+
"\n",
140+
"string EnsureDataSetDownloaded(string fileName)\n",
141+
"{\n",
142+
"\n",
143+
"\t// This is the path if the repo has been checked out.\n",
144+
"\tvar filePath = Path.Combine(Directory.GetCurrentDirectory(),\"data\", fileName);\n",
145+
"\n",
146+
"\tif (!File.Exists(filePath))\n",
147+
"\t{\n",
148+
"\t\t// This is the path if the file has already been downloaded.\n",
149+
"\t\tfilePath = Path.Combine(Directory.GetCurrentDirectory(), fileName);\n",
150+
"\t}\n",
151+
"\n",
152+
"\tif (!File.Exists(filePath))\n",
153+
"\t{\n",
154+
"\t\tusing (var client = new WebClient())\n",
155+
"\t\t{\n",
156+
"\t\t\tclient.DownloadFile($\"https://raw.githubusercontent.com/dotnet/csharp-notebooks/main/machine-learning/data/{fileName}\", filePath);\n",
157+
"\t\t}\n",
158+
"\t\tConsole.WriteLine($\"Downloaded {fileName} to : {filePath}\");\n",
159+
"\t}\n",
160+
"\telse\n",
161+
"\t{\n",
162+
"\t\tConsole.WriteLine($\"{fileName} found here: {filePath}\");\n",
163+
"\t}\n",
164+
"\n",
165+
"\treturn filePath;\n",
166+
"}"
167+
],
168+
"outputs": []
169+
},
119170
{
120171
"cell_type": "code",
121172
"execution_count": 1,
@@ -125,7 +176,7 @@
125176
}
126177
},
127178
"source": [
128-
"var dataPath = @\"./data/Luna.csv\";\n",
179+
"var dataPath = EnsureDataSetDownloaded(@\"Luna.csv\");\n",
129180
"var df = DataFrame.LoadCsv(dataPath);\n",
130181
"var loads = df[\"load\"].Cast<float?>();"
131182
],

0 commit comments

Comments
 (0)