Skip to content

Commit ebe8e55

Browse files
committed
changes
1 parent 5f6315c commit ebe8e55

File tree

13 files changed

+3057
-0
lines changed

13 files changed

+3057
-0
lines changed
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
Copyright (c) 2024 Oracle and/or its affiliates.
2+
3+
The Universal Permissive License (UPL), Version 1.0
4+
5+
Subject to the condition set forth below, permission is hereby granted to any
6+
person obtaining a copy of this software, associated documentation and/or data
7+
(collectively the "Software"), free of charge and under any and all copyright
8+
rights in the Software, and any and all patent rights owned or freely
9+
licensable by each licensor hereunder covering either (i) the unmodified
10+
Software as contributed to or provided by such licensor, or (ii) the Larger
11+
Works (as defined below), to deal in both
12+
13+
(a) the Software, and
14+
(b) any piece of software and/or hardware listed in the lrgrwrks.txt file if
15+
one is included with the Software (each a "Larger Work" to which the Software
16+
is contributed by such licensors),
17+
18+
without restriction, including without limitation the rights to copy, create
19+
derivative works of, display, perform, and distribute the Software and make,
20+
use, sell, offer for sale, import, export, have made, and have sold the
21+
Software and the Larger Work(s), and to sublicense the foregoing rights on
22+
either these or other terms.
23+
24+
This license is subject to the following condition:
25+
The above copyright notice and either this complete permission notice or at
26+
a minimum a reference to the UPL must be included in all copies or
27+
substantial portions of the Software.
28+
29+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
30+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
31+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
32+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
33+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
34+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
35+
SOFTWARE.
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# OCI Data Flow Connection to AWS S3
2+
3+
Reviewed: 10.07.2024
4+
5+
# When to use this asset?
6+
7+
Use this asset when your OCI Data Flow application needs to query data from and/or push data to AWS S3.
8+
9+
# How to use this asset?
10+
11+
- Review the code in the notebook and adapt it for your own OCI Data Flow application.
12+
- Add your S3 credentials and ensure you are authenticated on AWS. Never commit real credentials to the notebook; prefer environment variables or a secrets manager.
13+
14+
# License
15+
16+
Copyright (c) 2024 Oracle and/or its affiliates.
17+
18+
Licensed under the Universal Permissive License (UPL), Version 1.0.
19+
20+
See [LICENSE](https://github.com/oracle-devrel/technology-engineering/blob/main/LICENSE) for more details.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,340 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"id": "398c4ebf-d456-4d3d-ba18-cd431c3b0290",
6+
"metadata": {},
7+
"source": [
8+
"### OCI Data Science - Useful Tips\n",
9+
"<details>\n",
10+
"<summary><font size=\"2\">Check for Public Internet Access</font></summary>\n",
11+
"\n",
12+
"```python\n",
13+
"import requests\n",
14+
"response = requests.get(\"https://oracle.com\")\n",
15+
"assert response.status_code==200, \"Internet connection failed\"\n",
16+
"```\n",
17+
"</details>\n",
18+
"<details>\n",
19+
"<summary><font size=\"2\">Helpful Documentation </font></summary>\n",
20+
"<ul><li><a href=\"https://docs.cloud.oracle.com/en-us/iaas/data-science/using/data-science.htm\">Data Science Service Documentation</a></li>\n",
21+
"<li><a href=\"https://docs.cloud.oracle.com/iaas/tools/ads-sdk/latest/index.html\">ADS documentation</a></li>\n",
22+
"</ul>\n",
23+
"</details>\n",
24+
"<details>\n",
25+
"<summary><font size=\"2\">Typical Cell Imports and Settings for ADS</font></summary>\n",
26+
"\n",
27+
"```python\n",
28+
"%load_ext autoreload\n",
29+
"%autoreload 2\n",
30+
"%matplotlib inline\n",
31+
"\n",
32+
"import warnings\n",
33+
"warnings.filterwarnings('ignore')\n",
34+
"\n",
35+
"import logging\n",
36+
"logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.ERROR)\n",
37+
"\n",
38+
"import ads\n",
39+
"from ads.dataset.factory import DatasetFactory\n",
40+
"from ads.automl.provider import OracleAutoMLProvider\n",
41+
"from ads.automl.driver import AutoML\n",
42+
"from ads.evaluations.evaluator import ADSEvaluator\n",
43+
"from ads.common.data import ADSData\n",
44+
"from ads.explanations.explainer import ADSExplainer\n",
45+
"from ads.explanations.mlx_global_explainer import MLXGlobalExplainer\n",
46+
"from ads.explanations.mlx_local_explainer import MLXLocalExplainer\n",
47+
"from ads.catalog.model import ModelCatalog\n",
48+
"from ads.common.model_artifact import ModelArtifact\n",
49+
"```\n",
50+
"</details>\n",
51+
"<details>\n",
52+
"<summary><font size=\"2\">Useful Environment Variables</font></summary>\n",
53+
"\n",
54+
"```python\n",
55+
"import os\n",
56+
"print(os.environ[\"NB_SESSION_COMPARTMENT_OCID\"])\n",
57+
"print(os.environ[\"PROJECT_OCID\"])\n",
58+
"print(os.environ[\"USER_OCID\"])\n",
59+
"print(os.environ[\"TENANCY_OCID\"])\n",
60+
"print(os.environ[\"NB_REGION\"])\n",
61+
"```\n",
62+
"</details>"
63+
]
64+
},
65+
{
66+
"cell_type": "code",
67+
"execution_count": 63,
68+
"id": "7af22214-e7b7-4a66-90be-85f356523a3b",
69+
"metadata": {
70+
"tags": []
71+
},
72+
"outputs": [],
73+
"source": [
74+
import os

from pyspark.sql import SparkSession
from pyspark.conf import SparkConf

# Create a Spark session configured for AWS S3 access through the s3a
# connector. Credentials are read from the environment rather than typed
# into the notebook -- hardcoded secrets leak through checked-in files and
# rendered outputs.
conf = (
    SparkConf()
    .setAppName("MY_APP")  # replace with your desired name
    .set(
        "spark.jars.packages",
        "org.apache.hadoop:hadoop-aws:3.2.1,"
        "com.amazonaws:aws-java-sdk-s3:1.11.655,"
        "com.amazonaws:aws-java-sdk-core:1.11.655,"
        "org.apache.spark:spark-hadoop-cloud_2.12:3.2.1",
    )
    # Standard AWS variable names; empty default keeps the original
    # placeholder behavior when the variables are unset.
    .set("spark.hadoop.fs.s3a.access.key", os.environ.get("AWS_ACCESS_KEY_ID", ""))
    .set("spark.hadoop.fs.s3a.secret.key", os.environ.get("AWS_SECRET_ACCESS_KEY", ""))
    # Route the plain "s3" scheme to the s3a filesystem implementation.
    .set("spark.hadoop.fs.s3.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
)

spark = SparkSession.builder.config(conf=conf).getOrCreate()
90+
]
91+
},
92+
{
93+
"cell_type": "code",
94+
"execution_count": 54,
95+
"id": "2d735772-887d-45ee-9437-a5738701c0d2",
96+
"metadata": {
97+
"tags": []
98+
},
99+
"outputs": [],
100+
"source": [
101+
import os

# Alternative: apply the s3a options to an already-running session.
# Credentials come from the environment -- never hardcode them in a cell.
# NOTE(review): Hadoop filesystem options are generally only honored if set
# before the first s3a access; prefer configuring them on SparkConf at
# session-creation time (as in the cell above) -- confirm for your runtime.
spark.conf.set("spark.hadoop.fs.s3a.access.key", os.environ.get("AWS_ACCESS_KEY_ID", ""))
spark.conf.set("spark.hadoop.fs.s3a.secret.key", os.environ.get("AWS_SECRET_ACCESS_KEY", ""))
spark.conf.set("spark.hadoop.fs.s3a.path.style.access", "true")
spark.conf.set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
106+
]
107+
},
108+
{
109+
"cell_type": "code",
110+
"execution_count": 64,
111+
"id": "bf8ecf4c-6943-4f28-9fc7-856ab30841b6",
112+
"metadata": {
113+
"tags": []
114+
},
115+
"outputs": [
116+
{
117+
"data": {
118+
"text/plain": [
119+
"'org.apache.hadoop.fs.s3a.S3AFileSystem'"
120+
]
121+
},
122+
"execution_count": 64,
123+
"metadata": {},
124+
"output_type": "execute_result"
125+
}
126+
],
127+
"source": [
128+
# Confirm which filesystem implementation is registered for the "s3" scheme.
s3_impl = spark.conf.get("spark.hadoop.fs.s3.impl")
s3_impl
129+
]
130+
},
131+
{
132+
"cell_type": "code",
133+
"execution_count": 65,
134+
"id": "c413d697-c749-4416-a95a-7833fe3317af",
135+
"metadata": {
136+
"tags": []
137+
},
138+
"outputs": [
139+
{
140+
"name": "stdout",
141+
"output_type": "stream",
142+
"text": [
143+
"Reading data from object store\n",
144+
"+----------+--------+----+---------------------+-------------------+--------------+\n",
145+
"| DATE_KEY|PRESSURE| RPM|OPERATING_TEMPERATURE|BEARING_TEMPERATURE|MACHINE_STATUS|\n",
146+
"+----------+--------+----+---------------------+-------------------+--------------+\n",
147+
"|07.08.2016| 3700|5715| 84| 57| 0|\n",
148+
"|09.08.2016| 3315|5582| 116| 69| 0|\n",
149+
"|09.08.2016| 3179|2471| 82| 67| 0|\n",
150+
"|07.01.2017| 4280|4793| 80,66| 71| 1|\n",
151+
"|07.01.2017| 4480|3086| 120| 71| 1|\n",
152+
"|07.01.2017| 4280|2522| 94,6| 76,86| 1|\n",
153+
"|08.01.2017| 4320|4732| 121,98| 59,36| 1|\n",
154+
"|08.01.2017| 4200|3105| 112| 68,88| 1|\n",
155+
"|08.01.2017| 4640|4436| 119| 76,88| 1|\n",
156+
"|08.01.2017| 4640|4012| 90| 75| 1|\n",
157+
"|08.01.2017| 4320|3097| 114,46| 82,28| 1|\n",
158+
"|09.01.2017| 4640|2864| 132| 62,73| 1|\n",
159+
"|09.01.2017| 4640|2557| 99,12| 75,4| 1|\n",
160+
"|09.01.2017| 4440|3911| 122,96| 74| 1|\n",
161+
"|09.01.2017| 4320|2432| 93,15| 77,44| 1|\n",
162+
"|09.01.2017| 4560|4786| 115| 76,88| 1|\n",
163+
"|09.01.2017| 4560|4359| 98,28| 66,25| 1|\n",
164+
"|10.01.2017| 4680|2519| 120| 75,48| 1|\n",
165+
"|10.01.2017| 4640|4106| 108,1| 62,22| 1|\n",
166+
"|15.01.2017| 4280|4409| 116| 74,75| 1|\n",
167+
"+----------+--------+----+---------------------+-------------------+--------------+\n",
168+
"only showing top 20 rows\n",
169+
"\n"
170+
]
171+
},
172+
{
173+
"data": {
174+
"text/plain": [
175+
"1981"
176+
]
177+
},
178+
"execution_count": 65,
179+
"metadata": {},
180+
"output_type": "execute_result"
181+
}
182+
],
183+
"source": [
184+
# Read the semicolon-delimited sensor CSV from S3, preview it, and count rows.
csv_path = "s3a://samplesdata/AssetSensorData.csv"

print("Reading data from object store")
reader = (
    spark.read.format("csv")
    .option("inferSchema", "true")
    .option("header", "true")
    .option("multiLine", "false")
    .option("delimiter", ";")
    .option("dateFormat", "dd.MM.yyyy")
)
df = reader.load(csv_path)

df.show()
df.count()
200+
]
201+
},
202+
{
203+
"cell_type": "code",
204+
"execution_count": 59,
205+
"id": "044e42f1-1120-43b8-aa3c-23b05764f908",
206+
"metadata": {
207+
"tags": []
208+
},
209+
"outputs": [
210+
{
211+
"name": "stderr",
212+
"output_type": "stream",
213+
"text": [
214+
" \r"
215+
]
216+
},
217+
{
218+
"data": {
219+
"text/plain": [
220+
"695"
221+
]
222+
},
223+
"execution_count": 59,
224+
"metadata": {},
225+
"output_type": "execute_result"
226+
}
227+
],
228+
"source": [
229+
# Read a Parquet object from S3 and report its row count.
parquet_path = (
    "s3a://baltrans/testdata_year=2024_month=2024-05_day=2024-05-06_"
    "hour=09_part-00000-d6d45e02-0c9b-401f-914c-588781770fb2.c000.snappy.parquet"
)
df = spark.read.parquet(parquet_path)
df.count()
232+
]
233+
},
234+
{
235+
"cell_type": "code",
236+
"execution_count": 66,
237+
"id": "841bcadc-c967-4d21-a8da-2c30cb205f10",
238+
"metadata": {
239+
"tags": []
240+
},
241+
"outputs": [
242+
{
243+
"name": "stderr",
244+
"output_type": "stream",
245+
"text": [
246+
" \r"
247+
]
248+
},
249+
{
250+
"data": {
251+
"text/plain": [
252+
"110002"
253+
]
254+
},
255+
"execution_count": 66,
256+
"metadata": {},
257+
"output_type": "execute_result"
258+
}
259+
],
260+
"source": [
261+
# Read a JSON object from S3 and report its row count.
json_path = "s3a://samplesdata/balance_transaction.json"
df = spark.read.json(json_path)
df.count()
264+
]
265+
},
266+
{
267+
"cell_type": "code",
268+
"execution_count": 56,
269+
"id": "f8fe9a51-b91b-4239-b2e4-a1e77bb3dc00",
270+
"metadata": {
271+
"tags": []
272+
},
273+
"outputs": [],
274+
"source": [
275+
"spark.stop()\n"
276+
]
277+
},
278+
{
279+
"cell_type": "code",
280+
"execution_count": 69,
281+
"id": "1ddd0ce3-8fab-49d2-ab19-40d925d6eb7f",
282+
"metadata": {
283+
"tags": []
284+
},
285+
"outputs": [
286+
{
287+
"name": "stderr",
288+
"output_type": "stream",
289+
"text": [
290+
" \r"
291+
]
292+
},
293+
{
294+
"data": {
295+
"text/plain": [
296+
"528"
297+
]
298+
},
299+
"execution_count": 69,
300+
"metadata": {},
301+
"output_type": "execute_result"
302+
}
303+
],
304+
"source": [
305+
# Read a small JSON object from S3 and report its row count.
# NOTE(review): this cell appears after the spark.stop() cell; under
# Restart-and-Run-All it will fail on a stopped session -- reorder it above
# the stop cell.
small_json_path = "s3a://samplesdata/smalldata.json"
df = spark.read.json(small_json_path)
df.count()
308+
]
309+
},
310+
{
311+
"cell_type": "code",
312+
"execution_count": null,
313+
"id": "f3e6b10b-0e7f-422f-bed2-3b373649433f",
314+
"metadata": {},
315+
"outputs": [],
316+
"source": []
317+
}
318+
],
319+
"metadata": {
320+
"kernelspec": {
321+
"display_name": "Python [conda env:pyspark32_p38_cpu_v3]",
322+
"language": "python",
323+
"name": "conda-env-pyspark32_p38_cpu_v3-py"
324+
},
325+
"language_info": {
326+
"codemirror_mode": {
327+
"name": "ipython",
328+
"version": 3
329+
},
330+
"file_extension": ".py",
331+
"mimetype": "text/x-python",
332+
"name": "python",
333+
"nbconvert_exporter": "python",
334+
"pygments_lexer": "ipython3",
335+
"version": "3.8.16"
336+
}
337+
},
338+
"nbformat": 4,
339+
"nbformat_minor": 5
340+
}

0 commit comments

Comments
 (0)