1- #!/usr/bin/python
1+ #!/usr/bin/env python3
22import csv
33import glob
4+ from multiprocessing import Process , Queue , current_process
45from nif_walker import walk_nif
56import os
67import sys
@@ -36,79 +37,140 @@ def find_match(ap, rap):
3637 return False
3738
3839
39- with open ("Manifests/UIXR.manifest" ) as f :
40- uixr_data = csv .DictReader (f )
41- tar_ball = tarfile .open (RELEASE_PATH , "w:" )
def retrieve_assets_from_nif(nif_path):
    """Return the lower-cased file names of the assets a NIF refers to.

    ``walk_nif`` is called for ``nif_path`` with stdout reporting turned
    off.  Every string it yields is split on ``', '`` and each piece is
    run through ``get_filename`` and lower-cased.

    No de-duplication or blank filtering happens here; the list holds one
    entry per piece, in the order they were produced.
    """
    return [
        get_filename(entry).lower()
        for listing in walk_nif(nif_path=nif_path, use_stdout=False)
        for entry in listing.split(', ')
    ]
48+
49+
def worker(work_queue, done_queue):
    """Parse the paths taken from ``work_queue`` and report their assets.

    Intended as a ``multiprocessing.Process`` target.  Paths are consumed
    until the ``'STOP'`` sentinel is read.  For every path a
    ``(process_name, nif_path, assets)`` tuple is put on ``done_queue``;
    ``assets`` is an empty list when the file could not be parsed.

    A closing ``'STOP'`` is always put on ``done_queue`` -- even when this
    worker fails unexpectedly -- so a consumer that counts sentinels is
    never left waiting for one that will not arrive.

    Args:
        work_queue: queue of paths to parse, terminated by ``'STOP'``.
        done_queue: queue that receives the result tuples and the
            closing ``'STOP'``.

    Returns:
        True once the input sentinel has been reached.
    """
    spinner = spinning_cursor()
    p = current_process()
    try:
        for nif_path in iter(work_queue.get, 'STOP'):
            try:
                remaining = work_queue.qsize()
            except NotImplementedError:
                # qsize() is not available on every platform (macOS lacks
                # sem_getvalue()); the counter is purely cosmetic.
                remaining = '?'

            # Single-line progress display: erase the line, draw the
            # spinner and status, then return the cursor to the start.
            sys.stdout.write("\033[K")
            sys.stdout.write(next(spinner))
            sys.stdout.write(" [{0}][{1}][{2}]".format(remaining, p.name, nif_path))
            sys.stdout.flush()
            sys.stdout.write('\r\b')

            assets = []
            try:
                assets = retrieve_assets_from_nif(nif_path)
            except Exception as exc:
                # Best effort: one unreadable file must not stop the run.
                # Presumably non-NIF files are queued as well and are
                # expected to fail, so only report files that should have
                # parsed -- TODO confirm against the manifest contents.
                if str(nif_path).lower().endswith('.nif'):
                    print("WARNING: could not parse NIF -> {0} ({1})".format(nif_path, exc))
            done_queue.put((p.name, nif_path, assets))
    finally:
        # Always hand back the sentinel so the parent can stop waiting.
        done_queue.put('STOP')
    print("{} finished.".format(p.name))
    return True
69+
70+
def main():
    """Build the release tarball of CC-licensed assets and their sub-assets.

    Steps:
      1. Read ``Manifests/UIXR.manifest`` and add every asset that exists
         under ``ASSET_PATH`` and is licensed cc0, cc-by or cc-by-nc to
         the tarball at ``RELEASE_PATH``.
      2. Queue those asset paths for ``worker`` processes, which report
         the file names each asset references.
      3. Look up every referenced file below ``ASSET_PATH/UIX``, warn
         about non-CC licenses and missing files, and add the matches to
         the tarball.

    The tarball is closed even when a step raises.
    """
    import queue  # local import: only needed for the Empty exception

    cc_licenses = ('cc0', 'cc-by', 'cc-by-nc')

    # Read the manifest once.  Rewinding the file under a live DictReader
    # would hand the header line back as an ordinary data row.
    with open("Manifests/UIXR.manifest") as f:
        uixr_data = list(csv.DictReader(f))

    spinner = spinning_cursor()
    manifest = []
    additional_assets = []

    # setup multi-processing job
    workers = 8
    processes = []
    work_queue = Queue()
    done_queue = Queue()

    with tarfile.open(RELEASE_PATH, "w:") as tar_ball:
        print("Gathering assets: ")
        for row in uixr_data:
            # A missing license is treated as "not CC" instead of crashing.
            row_license = (row.get('license') or '').lower()
            if row_license not in cc_licenses:
                continue
            asset = row.get('asset')
            sys.stdout.write("\033[K")
            sys.stdout.write(next(spinner))
            sys.stdout.write(" [{0}]".format(asset))
            sys.stdout.flush()
            sys.stdout.write('\r\b')
            file_path = os.path.join(ASSET_PATH, asset)
            if not os.path.exists(file_path):
                print("WARNING: asset not found -> {0}".format(asset))
                continue
            manifest.append(asset)
            tar_ball.add(file_path, asset)
            work_queue.put(file_path)

        # let multiprocessing parse the nifs
        print("Parsing NIFs for additional sub-assets: ")
        for _ in range(workers):
            p = Process(target=worker, args=(work_queue, done_queue))
            processes.append(p)
            work_queue.put('STOP')  # one sentinel per worker
            p.start()

        print("Building assets to validate.")
        stops = 0
        while stops < workers:
            # Sample liveness *before* waiting: if nobody was alive and
            # the wait still times out, nothing more can arrive.
            alive = any(proc.is_alive() for proc in processes)
            try:
                item = done_queue.get(timeout=1)
            except queue.Empty:
                if alive:
                    continue
                print("WARNING: {0} worker(s) exited without finishing".format(workers - stops))
                break
            if isinstance(item, str) and item == 'STOP':
                stops += 1
            else:
                _, _, nif_assets = item
                additional_assets += nif_assets

        # Results are drained, so joining cannot block on unflushed data.
        for proc in processes:
            proc.join()

        print("Filtering assets.")
        # remove duplicates and the blank entry
        additional_assets = set(additional_assets)
        additional_assets.discard('')

        # remove files listed in manifest, we already have them
        for asset in manifest:
            additional_assets.discard(os.path.basename(asset).lower())

        print("Gathering sub-assets: ")
        # NOTE(review): the nesting below was reconstructed from a diff
        # that lost its indentation -- confirm that "found = False"
        # belongs to the non-CC branch and that tar_ball.add() runs once
        # per globbed file.
        for nif_asset in additional_assets:
            found = False
            na_filename, _ = os.path.splitext(nif_asset.lower())
            glob_path = os.path.join(ASSET_PATH, "UIX", "**", na_filename + '.*')
            for asset_path in insensitive_glob(glob_path, True):
                found = True
                relative_asset_path = os.path.relpath(os.path.realpath(asset_path), ASSET_PATH)
                if relative_asset_path in tar_ball.getnames():
                    break  # file already exists, skip
                sys.stdout.write("\033[K")
                sys.stdout.write(next(spinner))
                sys.stdout.write(" [{0}]".format(relative_asset_path))
                sys.stdout.flush()
                sys.stdout.write('\r\b')
                for row in uixr_data:
                    if find_match(row.get('asset'), relative_asset_path):
                        row_license = (row.get('license') or '').lower()
                        if row_license in cc_licenses:
                            break  # good to go
                        print("WARNING: Non-CC license asset -> {0}".format(relative_asset_path))
                        found = False
                tar_ball.add(asset_path, relative_asset_path)
            if not found:
                print("WARNING: sub-asset not found -> {0}".format(nif_asset))
173+
174+
# Script entry point: main() runs only when this file is executed
# directly, not when the module is imported.
if __name__ == "__main__":
    main()
0 commit comments