Skip to content

Commit 55561fc

Browse files
author
tucotuco
committed
Added a script to remove download files from vn-downloads2 bucket on Google Cloud storage. Use this every month or so.
1 parent 94c9700 commit 55561fc

File tree

1 file changed

+82
-0
lines changed

1 file changed

+82
-0
lines changed

lib/GCS_cleaner.py

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
#!/usr/bin/env python
2+
# -*- coding: utf-8 -*-
3+
# The line above is to signify that the script contains utf-8 encoded characters.
4+
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
# Adapted from https://github.com/VertNet/bigquery
18+
19+
__author__ = "John Wieczorek"
20+
__contributors__ = "Javier Otegui, John Wieczorek"
21+
__copyright__ = "Copyright 2016 vertnet.org"
22+
__version__ = "bigquery_loader.py 2016-09-01T11:49+02:00"
23+
24+
from googleapis import CloudStorage as CS
25+
import csv
26+
#from datetime import datetime
27+
28+
def remove_GCS_files(cs, filelist):
    '''
    Remove a list of files from GCS, where the filenames in the list are the paths
    within the bucket defined in cs
    (e.g., processed/YPM/9643f840-f762-11e1-a439-00145eb45e9a/2016-07-16-aa)

    Parameters:
      cs - object exposing delete_object(path); it is called once per entry.
      filelist - iterable of object paths. None or an empty list removes nothing.

    Returns the number of entries for which delete_object() completed without
    raising. A failure on one entry is reported on stdout and does not stop the
    processing of the remaining entries.
    '''
    # get_file_list() returns None when it is given no input file name; treat
    # that the same as an empty list instead of failing on iteration.
    if not filelist:
        return 0

    removed = 0
    for path in filelist:
        try:
            cs.delete_object(path)
        except Exception as e:
            # Deliberately best-effort: report the failure and keep going.
            print('Failed to remove file %s Exception: %s' % (path, e))
        else:
            removed += 1
            # Progress report every 100 successful removals.
            if removed % 100 == 0:
                print('%s files removed' % removed)
    return removed
44+
45+
def get_file_list(inputfile):
    '''
    Read a text file of GCS listing lines and return the object paths in it.

    Each non-empty line is parsed as CSV and only its first field is used. That
    field is split on '/', and everything after the third '/' is taken as the
    object path, so that
      gs://vn-downloads2/processed/YPM/2016-07-16-aa
    yields
      processed/YPM/2016-07-16-aa
    Any text before 'gs:' on the line is ignored. Lines holding fewer than
    three '/' (e.g., a trailing summary line) are skipped.

    Parameters:
      inputfile - path of the text file to read.

    Returns a list of object paths in file order, or None (after printing a
    message) if inputfile is None or blank. Every path found is also printed.
    Errors raised while opening the file are not caught here.
    '''
    import sys

    if inputfile is None or len(inputfile.strip()) == 0:
        print('No file containing list of files given.')
        return None

    # Python 2 needs 'U' for universal newlines; Python 3 applies them by
    # default in text mode and rejects the 'U' flag from version 3.11 on.
    mode = 'rU' if sys.version_info[0] < 3 else 'r'

    filelist = []
    with open(inputfile, mode) as data:
        reader = csv.DictReader(data, fieldnames=['gcspath'])
        for row in reader:
            parts = row['gcspath'].split('/')
            # parts[0:3] are '<prefix>gs:', '' and the bucket name; a line
            # without them does not name an object in a bucket.
            if len(parts) < 4:
                continue
            # Keep every component after the bucket so that object names
            # containing '/' are not cut down to their first component.
            path = '/'.join(parts[3:])
            if len(path.strip()) > 0:
                filelist.append(path)
                print('%s' % path)
    return filelist
58+
59+
def main():
    '''
    Delete from the vn-downloads2 bucket every file listed in
    ./GCSFilesToDelete.txt

    Invoke without parameters as:
       python GCS_cleaner.py

    A list of candidate files can be produced with
       gsutil ls -l gs://vn-downloads2 > GCSFilesToDelete.txt
    and then filtered down to the entries older than 60 days.
    '''
    # Build the CloudStorage manager for the downloads bucket first, before the
    # list file is read.
    storage = CS.CloudStorage({ "bucket_name": "vn-downloads2" })

    # Paths come from the list file in the current working directory.
    targets = get_file_list('GCSFilesToDelete.txt')

    # The count returned by remove_GCS_files() is not reported here.
    remove_GCS_files(storage, targets)
80+
81+
# Run the cleanup only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)