1+ #!/usr/bin/env python
2+ # -*- coding: utf-8 -*-
3+ # The line above is to signify that the script contains utf-8 encoded characters.
4+
5+ # Licensed under the Apache License, Version 2.0 (the "License");
6+ # you may not use this file except in compliance with the License.
7+ # You may obtain a copy of the License at
8+ #
9+ # http://www.apache.org/licenses/LICENSE-2.0
10+ #
11+ # Unless required by applicable law or agreed to in writing, software
12+ # distributed under the License is distributed on an "AS IS" BASIS,
13+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+ # See the License for the specific language governing permissions and
15+ # limitations under the License.
16+
17+ # Adapted from https://github.com/VertNet/bigquery
18+
# Module metadata.
__author__ = "John Wieczorek"
__contributors__ = "Javier Otegui, John Wieczorek"
__copyright__ = "Copyright 2016 vertnet.org"
# NOTE(review): the version string names bigquery_loader.py, while the docstring of
# main() invokes this script as GCS_cleaner.py -- confirm which name is current.
__version__ = "bigquery_loader.py 2016-09-01T11:49+02:00"
23+
24+ from googleapis import CloudStorage as CS
25+ import csv
26+ #from datetime import datetime
27+
def remove_GCS_files(cs, filelist):
    '''
    Remove a list of files from GCS, where the filenames in the list are the paths
    within the bucket defined in cs
    (e.g., processed/YPM/9643f840-f762-11e1-a439-00145eb45e9a/2016-07-16-aa)

    Parameters:
        cs - storage manager object; only its delete_object(path) method is used.
        filelist - iterable of object paths to delete. None means there is
            nothing to do.

    Returns the number of files for which delete_object() did not raise.
    A failure on one file is reported and does not stop the remaining deletions.
    A progress line is printed after every 100 successful removals.
    '''
    if filelist is None:
        return 0

    n = 0
    for path in filelist:
        try:
            cs.delete_object(path)
        except Exception as e:
            # Best effort: report the failure and carry on with the rest of the list.
            print('Failed to remove file %s Exception: %s' % (path, e))
            continue
        n += 1
        if n % 100 == 0:
            print('%s files removed' % n)
    return n
44+
def get_file_list(inputfile):
    '''
    Read a list of GCS object paths from a text file.

    Each line is expected to contain a GCS URL of the form gs://bucket/path, optionally
    preceded by other text without slashes (as in the output of gsutil ls -l). Only the
    first comma-separated column of a line is examined. Everything after the bucket
    name is kept, so nested object paths survive intact.

    Parameters:
        inputfile - path to the text file containing the list.

    Returns a list of object paths within the bucket, or None if inputfile is None
    or blank. Lines that do not contain an object path (for example the TOTAL summary
    line of gsutil ls -l) are reported and skipped. Each accepted path is printed.
    '''
    import sys

    if inputfile is None or len(inputfile.strip()) == 0:
        print('No file containing list of files given.')
        return None

    # The 'U' flag was removed from open() in Python 3.11; text mode on Python 3
    # already applies universal newlines, so it is only requested on Python 2.
    mode = 'rU' if sys.version_info[0] < 3 else 'r'

    filelist = []
    with open(inputfile, mode) as data:
        reader = csv.DictReader(data, fieldnames=['gcspath'])
        for row in reader:
            gcspath = row['gcspath']
            # 'gs://bucket/a/b' -> ['gs:', '', 'bucket', 'a/b']; the split is limited
            # to three so that slashes inside the object path are preserved.
            parts = gcspath.split('/', 3)
            if len(parts) < 4:
                print('Skipping line without a GCS object path: %s' % gcspath)
                continue
            name = parts[3]
            if name.strip():
                filelist.append(name)
                print('%s' % name)
    return filelist
58+
def main(inputfile='GCSFilesToDelete.txt', bucket_name='vn-downloads2'):
    '''
    Remove the files listed in inputfile from the bucket bucket_name.

    By default, get the files to process from ./GCSFilesToDelete.txt and remove them
    from the downloads bucket.
    Invoke without parameters as:
       python GCS_cleaner.py

    A list of candidate files can be found by
       gsutil ls -l gs://vn-downloads2 > GCSFilesToDelete.txt
    then filter for those before 60 days ago.

    Parameters:
        inputfile - path to the text file listing the GCS files to delete.
        bucket_name - name of the bucket the files are removed from.
    '''
    # Read the list first so that a missing or empty list stops the run before a
    # storage manager is created.
    filelist = get_file_list(inputfile)
    if not filelist:
        print('No files to remove.')
        return

    # Create a CloudStorage Manager to be able to access Google Cloud Storage based on
    # the given bucket.
    cs_cred = {"bucket_name": bucket_name}
    cs = CS.CloudStorage(cs_cred)

    filesremoved = remove_GCS_files(cs, filelist)

    # Progress is only printed every 100 removals, so always report the final total.
    print('%s file(s) removed' % filesremoved)
80+
# Run the cleanup only when this file is executed as a script, not when it is imported.
if __name__ == "__main__" :
    main ()
0 commit comments