1+ import sys
2+ sys .path .append ('airflow' )
3+
4+ from airflow .models import DAG , TaskInstance , BaseOperator
5+ from hook .twitter_hook import TwitterHook
6+ from datetime import datetime , timedelta
7+ from os .path import join
8+ from pathlib import Path
9+ import json
10+
class TwitterOperator(BaseOperator):
    """Extract tweets via TwitterHook and persist them as NDJSON.

    Each page yielded by ``TwitterHook.run()`` is serialized as one JSON
    object per line (newline-delimited JSON) in ``file_path``.
    """

    def __init__(self, file_path, start_time, end_time, query, **kwargs):
        """
        Args:
            file_path: Destination path for the NDJSON output file.
            start_time: Start of the search window (Twitter API timestamp string).
            end_time: End of the search window (Twitter API timestamp string).
            query: Search term passed to the Twitter API.
            **kwargs: Forwarded to ``BaseOperator`` (e.g. ``task_id``).
        """
        # Initialize the Airflow base class first so operator plumbing
        # (task_id, DAG wiring, ...) exists before our own state.
        super().__init__(**kwargs)
        self.file_path = file_path
        self.start_time = start_time
        self.end_time = end_time
        self.query = query

    def create_parent_folder(self):
        """Ensure the directory that will contain ``file_path`` exists."""
        Path(self.file_path).parent.mkdir(parents=True, exist_ok=True)

    def execute(self, context):
        """Fetch all result pages and write them to ``file_path``.

        Args:
            context: Airflow task context (unused by this operator).
        """
        self.create_parent_folder()
        # utf-8 is required: we dump with ensure_ascii=False, and the
        # platform-default encoding could fail on non-ASCII tweet text.
        with open(self.file_path, 'w', encoding='utf-8') as output_file:
            for page in TwitterHook(self.start_time, self.end_time, self.query).run():
                json.dump(page, output_file, ensure_ascii=False)
                # Fix: original wrote '\n ' (newline + trailing space),
                # which breaks strict NDJSON parsers; end each record
                # with a bare newline.
                output_file.write('\n')
32+
if __name__ == '__main__':
    # Twitter API v2 timestamp format (fixed .00 fraction, UTC 'Z' suffix).
    timestamp_format = '%Y-%m-%dT%H:%M:%S.00Z'

    # Capture "now" exactly once so the search window, the DAG start date,
    # and the output file names all agree even if this script happens to
    # run across a second/midnight boundary.
    now = datetime.now()
    extract_date = now.date()

    # Window: midnight of yesterday (date() zeroes the time fields) up to now.
    start_time = (now + timedelta(-1)).date().strftime(timestamp_format)
    end_time = now.strftime(timestamp_format)
    query = "datascience"

    with DAG(dag_id='TwitterTest', start_date=now) as dag:
        twitter_operator = TwitterOperator(
            task_id='test_run',
            # Fix: original f-strings contained stray spaces inside the
            # literals ('extract_date={...} ' and '{...} .json'), producing
            # a directory name with a trailing space and a filename with a
            # space before the extension.
            file_path=join(
                'datalake',
                'twitter_datascience',
                f'extract_date={extract_date}',
                f'datascience_{extract_date.strftime("%Y%m%d")}.json'
            ),
            start_time=start_time,
            end_time=end_time,
            query=query
        )
        ti = TaskInstance(task=twitter_operator)
        # Ad-hoc local run: execute() ignores its context argument, so the
        # task_id stands in for a real Airflow context here.
        twitter_operator.execute(ti.task_id)