
Commit 0f87a9f

committed
init commit
1 parent 8a73fb4 commit 0f87a9f

File tree

8 files changed

+618 -2 lines changed

README.md

Lines changed: 1 addition & 2 deletions
@@ -1,2 +1 @@
-# salesforce_plugin
-Move Data From Salesforce -> S3 -> Redshift
+# Airflow Plugin - Salesforce

__init__.py

Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
from airflow.plugins_manager import AirflowPlugin
from hooks.salesforce_hook import SalesforceHook
from operators.s3_to_redshift_operator import S3ToRedshiftOperator
from operators.salesforce_schema_to_redshift_operator import SalesforceSchemaToRedshiftOperator
from operators.salesforce_to_s3_operator import SalesforceBulkQueryToS3Operator

class SalesforceToRedshiftPlugin(AirflowPlugin):
    name = "salesforce_to_redshift_plugin"
    hooks = [SalesforceHook]
    operators = [S3ToRedshiftOperator, SalesforceSchemaToRedshiftOperator, SalesforceBulkQueryToS3Operator]
    executors = []
    macros = []
    admin_views = []
    flask_blueprints = []
    menu_links = []
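
Once Airflow's plugin manager picks this module up, the listed hook and operators become importable under the plugin name (Airflow 1.x behaviour); a minimal sketch of the resulting import paths:

# Import paths exposed by the Airflow 1.x plugin manager after this plugin is
# loaded from the plugins folder; the module names follow the `name` attribute above.
from airflow.hooks.salesforce_to_redshift_plugin import SalesforceHook
from airflow.operators.salesforce_to_redshift_plugin import (
    S3ToRedshiftOperator,
    SalesforceBulkQueryToS3Operator,
    SalesforceSchemaToRedshiftOperator,
)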

hooks/__init__.py

Whitespace-only changes.

hooks/salesforce_hook.py

Lines changed: 66 additions & 0 deletions
@@ -0,0 +1,66 @@
from airflow.hooks.base_hook import BaseHook
from simple_salesforce import Salesforce

class SalesforceHook(BaseHook):
    def __init__(
            self,
            conn_id,
            *args,
            **kwargs
    ):
        """
        Borrowed from airflow.contrib

        Creates a new connection to Salesforce
        and allows you to pull data out of SFDC and save it to a file.
        You can then use that file with other
        Airflow operators to move the data into another data source.
        :param conn_id: the name of the connection that has the parameters
            we need to connect to Salesforce.
            The connection should be type `http` and include a
            user's security token in the `Extras` field.

        .. note::
            For the HTTP connection type, you can include a
            JSON structure in the `Extras` field.
            We need a user's security token to connect to Salesforce.
            So we define it in the `Extras` field as:
            `{"security_token":"YOUR_SECURITY_TOKEN"}`
        """

        self.sf = None
        self.conn_id = conn_id
        self._args = args
        self._kwargs = kwargs

        # get the connection parameters
        self.connection = self.get_connection(conn_id)
        self.extras = self.connection.extra_dejson

    def get_conn(self):
        """
        Sign in to Salesforce.
        If we have already signed in, this will just return the original object.
        """
        if self.sf:
            return self.sf

        auth_type = self.extras.get('auth_type', 'password')

        if auth_type == 'direct':
            auth_kwargs = {
                'instance_url': self.connection.host,
                'session_id': self.connection.password
            }

        else:
            auth_kwargs = {
                'username': self.connection.login,
                'password': self.connection.password,
                'security_token': self.extras.get('security_token'),
                'instance_url': self.connection.host
            }
        # connect to Salesforce
        self.sf = Salesforce(**auth_kwargs)

        return self.sf
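
A minimal sketch of exercising the hook, assuming an `http` connection (here named `salesforce_default`) whose `Extras` field carries the security token as described in the docstring; the connection id and the SOQL query are illustrative, not part of this commit:

# Hypothetical usage sketch -- the connection id and query are assumptions.
from hooks.salesforce_hook import SalesforceHook

hook = SalesforceHook(conn_id='salesforce_default')
sf = hook.get_conn()  # returns a simple_salesforce.Salesforce client

# simple_salesforce exposes SOQL queries directly on the client
results = sf.query("SELECT Id, Name FROM Account LIMIT 10")
for record in results['records']:
    print(record['Id'], record['Name'])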

operators/__init__.py

Whitespace-only changes.
operators/s3_to_redshift_operator.py

Lines changed: 154 additions & 0 deletions
@@ -0,0 +1,154 @@
from random import choice
from string import ascii_lowercase

from airflow.hooks.postgres_hook import PostgresHook
# NOTE: this import path is assumed; the S3 hook used must expose get_credentials().
from airflow.hooks.S3_hook import S3Hook
from airflow.models import BaseOperator

class S3ToRedshiftOperator(BaseOperator):
    """
    S3 -> Redshift via COPY commands
    """

    template_fields = ('s3_key', 'copy_cmd')

    def __init__(self,
                 s3_conn_id, s3_bucket, s3_key,
                 rs_conn_id, rs_schema, rs_table,
                 copy_cmd, load_type='append',
                 join_key=None, incremental_key=None,
                 *args, **kwargs):

        super().__init__(*args, **kwargs)
        self.s3_conn_id = s3_conn_id
        self.s3_bucket = s3_bucket
        self.s3_key = s3_key

        self.rs_conn_id = rs_conn_id
        self.rs_schema = rs_schema
        self.rs_table = rs_table

        self.copy_cmd = copy_cmd
        self.load_type = load_type
        self.join_key = join_key
        self.incremental_key = incremental_key

        # Used in case of upsert
        self.tmp_tbl = None
        self.tmp_schema = None

        if self.load_type not in ["append", "upsert"]:
            raise Exception('Please choose "append" or "upsert".')

        if self.load_type == 'upsert' and (self.join_key is None or self.incremental_key is None):
            raise Exception('Upserts require join_key and incremental_key to be specified')

    def drop_tbl_ddl(self, schema, tbl, if_exists=True):
        base_drop = "DROP TABLE {if_exists} {schema}.{tbl}"

        if_exists = 'if exists' if if_exists else ''

        return base_drop.format(
            if_exists=if_exists,
            schema=schema,
            tbl=tbl
        )

    def duplicate_tbl_schema(self, old_schema, old_tbl, new_tbl=None, new_schema=None):
        new_tbl = new_tbl if new_tbl is not None else old_tbl
        new_schema = new_schema if new_schema is not None else old_schema

        cmd = 'CREATE TABLE {new_schema}.{new_tbl}(LIKE {old_schema}.{old_tbl});'

        # give new_tbl a unique name in case more than one task is running
        rand4 = ''.join((choice(ascii_lowercase) for i in range(4)))
        new_tbl += '_tmp_' + rand4 if new_tbl == old_tbl else ''

        self.tmp_tbl = new_tbl
        self.tmp_schema = new_schema

        return cmd.format(
            new_schema=new_schema,
            new_tbl=new_tbl,
            old_schema=old_schema,
            old_tbl=old_tbl
        )

    def del_from_tbl_ddl(self, del_schema, del_tbl, join_schema, join_tbl, conditions=None):
        delete = """DELETE FROM {src_schema}.{src_tbl} USING {join_schema}.{join_tbl} join_tbl"""

        delete = delete.format(
            src_schema=del_schema,
            src_tbl=del_tbl,
            join_schema=join_schema,
            join_tbl=join_tbl
        )

        if conditions:
            delete += '\nWHERE '
            delete += '\nAND '.join(conditions)

        return delete

    def insert_stg_into_dst_ddl(self, dst_schema, dst_tbl, stg_schema, stg_tbl):
        insert = """insert into {dst_schema}.{dst_tbl}\n (select * from {stg_schema}.{stg_tbl});"""

        return insert.format(
            dst_schema=dst_schema,
            dst_tbl=dst_tbl,
            stg_schema=stg_schema,
            stg_tbl=stg_tbl
        )

    def execute(self, context):
        """
        Runs the COPY command on Redshift
        """
        pg = PostgresHook(postgres_conn_id=self.rs_conn_id)

        a_key, s_key = S3Hook(s3_conn_id=self.s3_conn_id).get_credentials()
        conn_str = 'aws_access_key_id={};aws_secret_access_key={}'.format(a_key, s_key)

        # If append -> normal copy into table
        if self.load_type == 'append':
            copy_cmd = self.copy_cmd.format(creds=conn_str, bucket=self.s3_bucket, key=self.s3_key)
            pg.run(copy_cmd)

        else:
            # Duplicate dst tbl into a temporary staging table
            duplicate_tbl = self.duplicate_tbl_schema(self.rs_schema, self.rs_table)
            pg.run(duplicate_tbl)

            copy_cmd = self.copy_cmd.format(creds=conn_str, bucket=self.s3_bucket, key=self.s3_key)
            pg.run(copy_cmd)

            # DELETE duplicate rows
            del_conditions = [
                "{}.{} = join_tbl.{}".format(
                    self.rs_table,
                    self.join_key,
                    self.join_key
                ),
                "{}.{} < join_tbl.{}".format(
                    self.rs_table,
                    self.incremental_key,
                    self.incremental_key
                )
            ]

            del_ddl = self.del_from_tbl_ddl(
                self.rs_schema,
                self.rs_table,
                self.tmp_schema,
                self.tmp_tbl,
                del_conditions,
            )
            pg.run(del_ddl)

            # Do inserts
            insert_ddl = self.insert_stg_into_dst_ddl(
                self.rs_schema,
                self.rs_table,
                self.tmp_schema,
                self.tmp_tbl,
            )
            pg.run(insert_ddl)

            # Clean up the temp table
            drop_ddl = self.drop_tbl_ddl(self.tmp_schema, self.tmp_tbl)
            pg.run(drop_ddl)
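
A usage sketch for the append path, assuming the plugin is on the Airflow plugins path; the connection ids, bucket, key, and table names below are placeholders, and `copy_cmd` is expected to be a Redshift COPY template with `{creds}`, `{bucket}`, and `{key}` placeholders that `execute()` fills in:

# Hypothetical DAG task -- connection ids, bucket, key and table names are
# illustrative assumptions, not part of this commit.
from datetime import datetime

from airflow import DAG
from operators.s3_to_redshift_operator import S3ToRedshiftOperator

dag = DAG('s3_to_redshift_example',
          start_date=datetime(2017, 1, 1),
          schedule_interval='@daily')

copy_accounts = S3ToRedshiftOperator(
    task_id='copy_accounts',
    s3_conn_id='s3_default',
    s3_bucket='my-data-bucket',
    s3_key='salesforce/account/{{ ds }}.json',  # rendered via template_fields
    rs_conn_id='redshift_default',
    rs_schema='public',
    rs_table='sf_account',
    copy_cmd=(
        "COPY public.sf_account "
        "FROM 's3://{bucket}/{key}' "
        "CREDENTIALS '{creds}' "
        "JSON 'auto';"
    ),
    load_type='append',
    dag=dag,
)

Because `s3_key` and `copy_cmd` are template fields, Jinja macros such as `{{ ds }}` render at runtime; `execute()` then fills `{creds}`, `{bucket}`, and `{key}` before handing the COPY to `PostgresHook.run()`. For `load_type='upsert'`, `join_key` and `incremental_key` must also be supplied.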
