oracle-devrel
diff --git a/‎data-platform/open-source-data-platforms/oci-data-flow/code-examples/connect-to-AWS-S3/LICENSE
Lines changed: 35 additions & 0 deletions b/‎data-platform/open-source-data-platforms/oci-data-flow/code-examples/connect-to-AWS-S3/LICENSE
Lines changed: 35 additions & 0 deletions
diff --git a/‎data-platform/open-source-data-platforms/oci-data-flow/code-examples/connect-to-AWS-S3/README.md
Lines changed: 20 additions & 0 deletions b/‎data-platform/open-source-data-platforms/oci-data-flow/code-examples/connect-to-AWS-S3/README.md
Lines changed: 20 additions & 0 deletions
diff --git a/‎data-platform/open-source-data-platforms/oci-data-flow/code-examples/connect-to-AWS-S3/files/Spark-S3-AWS.ipynb
Lines changed: 340 additions & 0 deletions b/‎data-platform/open-source-data-platforms/oci-data-flow/code-examples/connect-to-AWS-S3/files/Spark-S3-AWS.ipynb
Lines changed: 340 additions & 0 deletions
@@ -0,0 +1,35 @@
+Copyright (c) 2024 Oracle and/or its affiliates.
+
+The Universal Permissive License (UPL), Version 1.0
+
+Subject to the condition set forth below, permission is hereby granted to any
+person obtaining a copy of this software, associated documentation and/or data
+(collectively the "Software"), free of charge and under any and all copyright
+rights in the Software, and any and all patent rights owned or freely
+licensable by each licensor hereunder covering either (i) the unmodified
+Software as contributed to or provided by such licensor, or (ii) the Larger
+Works (as defined below), to deal in both
+
+(a) the Software, and
+(b) any piece of software and/or hardware listed in the lrgrwrks.txt file if
+one is included with the Software (each a "Larger Work" to which the Software
+is contributed by such licensors),
+
+without restriction, including without limitation the rights to copy, create
+derivative works of, display, perform, and distribute the Software and make,
+use, sell, offer for sale, import, export, have made, and have sold the
+Software and the Larger Work(s), and to sublicense the foregoing rights on
+either these or other terms.
+
+This license is subject to the following condition:
+The above copyright notice and either this complete permission notice or at
+a minimum a reference to the UPL must be included in all copies or
+substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
@@ -0,0 +1,20 @@
+# OCI Data Flow Connection to AWS S3
+
+Reviewed: 10.07.2024
+
+# When to use this asset?
+
+When using OCI Data Flow and you have to query from and/or push data to AWS S3.
+
+# How to use this asset?
+
+- Review the code in the notebook and add the code to your personal OCI Data Flow application.
+- Add your S3 credentials, ensure you are authenticated on AWS.
+
+# License
+
+Copyright (c) 2024 Oracle and/or its affiliates.
+
+Licensed under the Universal Permissive License (UPL), Version 1.0.
+
+See [LICENSE](https://github.com/oracle-devrel/technology-engineering/blob/main/LICENSE) for more details.
@@ -0,0 +1,340 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "398c4ebf-d456-4d3d-ba18-cd431c3b0290",
+   "metadata": {},
+   "source": [
+    "### OCI Data Science - Useful Tips\n",
+    "<details>\n",
+    "<summary><font size=\"2\">Check for Public Internet Access</font></summary>\n",
+    "\n",
+    "```python\n",
+    "import requests\n",
+    "response = requests.get(\"https://oracle.com\")\n",
+    "assert response.status_code==200, \"Internet connection failed\"\n",
+    "```\n",
+    "</details>\n",
+    "<details>\n",
+    "<summary><font size=\"2\">Helpful Documentation </font></summary>\n",
+    "<ul><li><a href=\"https://docs.cloud.oracle.com/en-us/iaas/data-science/using/data-science.htm\">Data Science Service Documentation</a></li>\n",
+    "<li><a href=\"https://docs.cloud.oracle.com/iaas/tools/ads-sdk/latest/index.html\">ADS documentation</a></li>\n",
+    "</ul>\n",
+    "</details>\n",
+    "<details>\n",
+    "<summary><font size=\"2\">Typical Cell Imports and Settings for ADS</font></summary>\n",
+    "\n",
+    "```python\n",
+    "%load_ext autoreload\n",
+    "%autoreload 2\n",
+    "%matplotlib inline\n",
+    "\n",
+    "import warnings\n",
+    "warnings.filterwarnings('ignore')\n",
+    "\n",
+    "import logging\n",
+    "logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.ERROR)\n",
+    "\n",
+    "import ads\n",
+    "from ads.dataset.factory import DatasetFactory\n",
+    "from ads.automl.provider import OracleAutoMLProvider\n",
+    "from ads.automl.driver import AutoML\n",
+    "from ads.evaluations.evaluator import ADSEvaluator\n",
+    "from ads.common.data import ADSData\n",
+    "from ads.explanations.explainer import ADSExplainer\n",
+    "from ads.explanations.mlx_global_explainer import MLXGlobalExplainer\n",
+    "from ads.explanations.mlx_local_explainer import MLXLocalExplainer\n",
+    "from ads.catalog.model import ModelCatalog\n",
+    "from ads.common.model_artifact import ModelArtifact\n",
+    "```\n",
+    "</details>\n",
+    "<details>\n",
+    "<summary><font size=\"2\">Useful Environment Variables</font></summary>\n",
+    "\n",
+    "```python\n",
+    "import os\n",
+    "print(os.environ[\"NB_SESSION_COMPARTMENT_OCID\"])\n",
+    "print(os.environ[\"PROJECT_OCID\"])\n",
+    "print(os.environ[\"USER_OCID\"])\n",
+    "print(os.environ[\"TENANCY_OCID\"])\n",
+    "print(os.environ[\"NB_REGION\"])\n",
+    "```\n",
+    "</details>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 63,
+   "id": "7af22214-e7b7-4a66-90be-85f356523a3b",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from pyspark.sql import SparkSession\n",
+    "from pyspark.conf import SparkConf\n",
+    "\n",
+    "\n",
+    "# Create a Spark session with your AWS Credentials\n",
+    "\n",
+    "conf = (\n",
+    "    SparkConf()\n",
+    "    .setAppName(\"MY_APP\") # replace with your desired name\n",
+    "    .set(\"spark.jars.packages\", \"org.apache.hadoop:hadoop-aws:3.2.1,com.amazonaws:aws-java-sdk-s3:1.11.655,com.amazonaws:aws-java-sdk-core:1.11.655,org.apache.spark:spark-hadoop-cloud_2.12:3.2.1\")\n",
+    "    .set(\"spark.hadoop.fs.s3a.access.key\", \"\")\n",
+    "    .set(\"spark.hadoop.fs.s3a.secret.key\", \"\")\n",
+    "    .set(\"spark.hadoop.fs.s3.impl\", \"org.apache.hadoop.fs.s3a.S3AFileSystem\")\n",
+    ")\n",
+    "\n",
+    "spark = SparkSession.builder.config(conf=conf).getOrCreate()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 54,
+   "id": "2d735772-887d-45ee-9437-a5738701c0d2",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "spark.conf.set('spark.hadoop.fs.s3a.access.key', '')\n",
+    "spark.conf.set('spark.hadoop.fs.s3a.secret.key', '')\n",
+    "spark.conf.set('spark.hadoop.fs.s3a.path.style.access', 'true')\n",
+    "spark.conf.set(\"spark.hadoop.fs.s3a.impl\",\"org.apache.hadoop.fs.s3a.S3AFileSystem\")\n",
+    "    "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 64,
+   "id": "bf8ecf4c-6943-4f28-9fc7-856ab30841b6",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'org.apache.hadoop.fs.s3a.S3AFileSystem'"
+      ]
+     },
+     "execution_count": 64,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "spark.conf.get('spark.hadoop.fs.s3.impl')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 65,
+   "id": "c413d697-c749-4416-a95a-7833fe3317af",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Reading data from object store\n",
+      "+----------+--------+----+---------------------+-------------------+--------------+\n",
+      "|  DATE_KEY|PRESSURE| RPM|OPERATING_TEMPERATURE|BEARING_TEMPERATURE|MACHINE_STATUS|\n",
+      "+----------+--------+----+---------------------+-------------------+--------------+\n",
+      "|07.08.2016|    3700|5715|                   84|                 57|             0|\n",
+      "|09.08.2016|    3315|5582|                  116|                 69|             0|\n",
+      "|09.08.2016|    3179|2471|                   82|                 67|             0|\n",
+      "|07.01.2017|    4280|4793|                80,66|                 71|             1|\n",
+      "|07.01.2017|    4480|3086|                  120|                 71|             1|\n",
+      "|07.01.2017|    4280|2522|                 94,6|              76,86|             1|\n",
+      "|08.01.2017|    4320|4732|               121,98|              59,36|             1|\n",
+      "|08.01.2017|    4200|3105|                  112|              68,88|             1|\n",
+      "|08.01.2017|    4640|4436|                  119|              76,88|             1|\n",
+      "|08.01.2017|    4640|4012|                   90|                 75|             1|\n",
+      "|08.01.2017|    4320|3097|               114,46|              82,28|             1|\n",
+      "|09.01.2017|    4640|2864|                  132|              62,73|             1|\n",
+      "|09.01.2017|    4640|2557|                99,12|               75,4|             1|\n",
+      "|09.01.2017|    4440|3911|               122,96|                 74|             1|\n",
+      "|09.01.2017|    4320|2432|                93,15|              77,44|             1|\n",
+      "|09.01.2017|    4560|4786|                  115|              76,88|             1|\n",
+      "|09.01.2017|    4560|4359|                98,28|              66,25|             1|\n",
+      "|10.01.2017|    4680|2519|                  120|              75,48|             1|\n",
+      "|10.01.2017|    4640|4106|                108,1|              62,22|             1|\n",
+      "|15.01.2017|    4280|4409|                  116|              74,75|             1|\n",
+      "+----------+--------+----+---------------------+-------------------+--------------+\n",
+      "only showing top 20 rows\n",
+      "\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "1981"
+      ]
+     },
+     "execution_count": 65,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "file = \"s3a://samplesdata/AssetSensorData.csv\"\n",
+    "\n",
+    "# Load our data.\n",
+    "print(\"Reading data from object store\")\n",
+    "# Load our data.\n",
+    "df = (spark.read.format(\"csv\")\n",
+    "      .option(\"inferSchema\", \"true\")\n",
+    "      .option(\"header\",\"true\")\n",
+    "      .option(\"multiLine\", \"false\")\n",
+    "      .option(\"delimiter\",\";\")\n",
+    "      .option(\"dateFormat\",\"dd.MM.yyyy\")\n",
+    "      .load(file))\n",
+    "      \n",
+    "\n",
+    "df.show()\n",
+    "df.count()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 59,
+   "id": "044e42f1-1120-43b8-aa3c-23b05764f908",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "                                                                                \r"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "695"
+      ]
+     },
+     "execution_count": 59,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "file = \"s3a://baltrans/testdata_year=2024_month=2024-05_day=2024-05-06_hour=09_part-00000-d6d45e02-0c9b-401f-914c-588781770fb2.c000.snappy.parquet\"\n",
+    "df = spark.read.parquet(file)\n",
+    "df.count()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 66,
+   "id": "841bcadc-c967-4d21-a8da-2c30cb205f10",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "                                                                                \r"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "110002"
+      ]
+     },
+     "execution_count": 66,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "file = \"s3a://samplesdata/balance_transaction.json\"\n",
+    "df = spark.read.json(file)\n",
+    "df.count()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 56,
+   "id": "f8fe9a51-b91b-4239-b2e4-a1e77bb3dc00",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "spark.stop()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 69,
+   "id": "1ddd0ce3-8fab-49d2-ab19-40d925d6eb7f",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "                                                                                \r"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "528"
+      ]
+     },
+     "execution_count": 69,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "file = \"s3a://samplesdata/smalldata.json\"\n",
+    "df = spark.read.json(file)\n",
+    "df.count()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f3e6b10b-0e7f-422f-bed2-3b373649433f",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python [conda env:pyspark32_p38_cpu_v3]",
+   "language": "python",
+   "name": "conda-env-pyspark32_p38_cpu_v3-py"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.16"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}