-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path0_bootstrap.py
More file actions
executable file
·60 lines (50 loc) · 2.22 KB
/
0_bootstrap.py
File metadata and controls
executable file
·60 lines (50 loc) · 2.22 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
## Part 0: Bootstrap File
# You need to run this file at the start of the project. It will install the
# requirements, create the STORAGE environment variable and copy the data from
# raw/WA_Fn-UseC_-Telco-Customer-Churn-.csv into /datalake/data/churn of the
# STORAGE location.
# The STORAGE environment variable is the Cloud Storage location used by the DataLake
# to store hive data. On AWS it will be s3a://[something], on Azure it will be
# abfs://[something] and on a CDSW cluster it will be hdfs://[something]
# Install the requirements
# "!" is a CML/IPython shell magic: this shells out to pip and installs the
# packages listed in the project's requirements.txt before anything else runs.
!pip3 install -r requirements.txt
# Create the directories and upload data
from cmlbootstrap import CMLBootstrap
from IPython.display import Javascript, HTML
import os
import time
import json
import requests
import xml.etree.ElementTree as ET
import datetime
# Timestamp suffix (DDMMYYYYHHMMSS) used to make names of created
# resources unique per run.
run_time_suffix = datetime.datetime.now().strftime("%d%m%Y%H%M%S")
# Set the setup variables needed by CMLBootstrap
# HOST: scheme taken from CDSW_API_URL ("https" from "https://...") glued to
# the cluster domain, e.g. "https" + "://" + "ml-xxxx.example.com".
HOST = os.getenv("CDSW_API_URL").split(
":")[0] + "://" + os.getenv("CDSW_DOMAIN")
# USERNAME: 7th "/"-separated segment of the project URL — presumably the
# project owner's user name; verify against the CDSW_PROJECT_URL format.
USERNAME = os.getenv("CDSW_PROJECT_URL").split(
"/")[6] # args.username # "vdibia"
# API key and project name come straight from the session environment.
API_KEY = os.getenv("CDSW_API_KEY")
PROJECT_NAME = os.getenv("CDSW_PROJECT")
# Instantiate API Wrapper
# NOTE(review): os.getenv returns None for a missing variable, so .split above
# would raise AttributeError outside a CML/CDSW session — assumes all four
# CDSW_* variables are always set here.
cml = CMLBootstrap(HOST, USERNAME, API_KEY, PROJECT_NAME)
# Set the STORAGE environment variable
# Resolve the Cloud Storage root used by the DataLake and publish it both as a
# project-level environment variable (via the CML API) and in this process's
# environment so the "!hdfs" magics below can expand $STORAGE.
try:
    storage = os.environ["STORAGE"]
except KeyError:
    # STORAGE is not set yet. Derive it from the Hive warehouse directory when
    # a Hadoop client config is present; otherwise fall back to the HDFS user
    # home directory (plain CDSW cluster case).
    # (Was a bare `except:`, which would also hide unrelated errors.)
    if os.path.exists("/etc/hadoop/conf/hive-site.xml"):
        tree = ET.parse('/etc/hadoop/conf/hive-site.xml')
        root = tree.getroot()
        # Fallback in case hive-site.xml lacks the warehouse property; the
        # previous code left `storage` unbound here and raised NameError.
        storage = "/user/" + os.getenv("HADOOP_USER_NAME")
        for prop in root.findall('property'):
            if prop.find('name').text == "hive.metastore.warehouse.dir":
                # Keep only scheme + bucket/host of the warehouse URI,
                # e.g. "s3a://bucket/warehouse/tablespace" -> "s3a://bucket".
                value_parts = prop.find('value').text.split("/")
                storage = value_parts[0] + "//" + value_parts[2]
                break
    else:
        storage = "/user/" + os.getenv("HADOOP_USER_NAME")
    # Persist STORAGE on the project so future sessions/jobs inherit it.
    storage_environment_params = {"STORAGE": storage}
    storage_environment = cml.create_environment_variable(storage_environment_params)
    os.environ["STORAGE"] = storage
# Upload the data to the cloud storage
# "!" lines are CML/IPython shell magics: each shells out to the HDFS CLI,
# with $STORAGE expanded from the environment variable set earlier in this
# file. Build the datalake directory tree, then copy the raw churn CSV from
# the project workspace into it. `-p` makes the mkdirs safe to re-run;
# NOTE(review): copyFromLocal has no -f flag here, so it presumably fails if
# the target file already exists — confirm before re-running the bootstrap.
!hdfs dfs -mkdir -p $STORAGE/datalake
!hdfs dfs -mkdir -p $STORAGE/datalake/data
!hdfs dfs -mkdir -p $STORAGE/datalake/data/churn
!hdfs dfs -copyFromLocal /home/cdsw/raw/WA_Fn-UseC_-Telco-Customer-Churn-.csv $STORAGE/datalake/data/churn/WA_Fn-UseC_-Telco-Customer-Churn-.csv