1+ from addbiomechanics .commands .abtract_command import AbstractCommand
2+ import argparse
3+ from addbiomechanics .auth import AuthContext
4+ import os
5+ from datetime import datetime
6+ from addbiomechanics .s3_structure import S3Node , retrieve_s3_structure , sizeof_fmt
7+ from botocore .exceptions import NoCredentialsError , PartialCredentialsError , ClientError
8+ from typing import List , Dict , Tuple , Set , Optional
9+ import json
10+ import re
11+
12+
class SubjectToDownload:
    """One processed subject folder on S3, plus the files it contains.

    Derives two pieces of metadata from the listing at construction time:
    whether a 'REVIEWED' flag file is present, and the uploader's Cognito
    identity prefix (used later to build the ATTRIBUTION/credits file).
    """
    path: str
    # (key, size in bytes, S3 ETag) for every object under `path`
    contained_files: List[Tuple[str, int, str]]
    is_reviewed: bool
    # "us-west-2:<identity-id>/" (trailing slash kept), or 'Anonymous'
    username: str

    def __init__(self, path: str, contained_files: List[Tuple[str, int, str]]):
        """Record the listing and derive review status and uploader identity.

        :param path: S3 key prefix for this subject (e.g. ".../Subject01")
        :param contained_files: (key, size, e_tag) tuples found under `path`
        """
        self.path = path
        self.contained_files = contained_files
        basenames: List[str] = [os.path.basename(key) for key, _, _ in contained_files]
        self.is_reviewed = 'REVIEWED' in basenames

        print(f'Creating SubjectToDownload for {path} with {len(contained_files)} files. Is reviewed: {self.is_reviewed}')
        print(f"Num REVIEWED flags: {basenames.count('REVIEWED')}")

        # The uploader's identity is "us-west-2:" followed by any number of
        # non-slash characters, up to and including the first forward slash.
        # We keep it for constructing our ATTRIBUTION.txt file.
        identity = re.search(r"us-west-2:[^/]*/", path)
        self.username = identity.group(0) if identity else 'Anonymous'
40+
41+
class GenerateCreditsCommand(AbstractCommand):
    """CLI command that scans processed subjects on S3 and writes a data-license
    file crediting every AddBiomechanics user whose uploads matched the filters."""

    def register_subcommand(self, subparsers: argparse._SubParsersAction):
        """Register the 'generate-credits' subcommand and its flags."""
        download_parser = subparsers.add_parser(
            'generate-credits', help='Generate credits for a dataset from AddBiomechanics')
        download_parser.add_argument('--pattern',
                                     type=str,
                                     default=None,
                                     help='The regex to match subjects to be downloaded.')
        download_parser.add_argument('--prefix',
                                     type=str,
                                     default='standardized/',
                                     help='The folder prefix to match when listing potential files to download.')
        download_parser.add_argument('--marker-error-cutoff', type=float,
                                     help='The maximum marker RMSE (in meters) we will tolerate. Files that match the '
                                          'regex pattern but are from subjects that are above this threshold will'
                                          ' not be downloaded.', default=None)
        download_parser.add_argument('--reviewed-only',
                                     action='store_true',
                                     help='Only download files from subjects that are fully reviewed.')

    def run(self, ctx: AuthContext, args: argparse.Namespace):
        """Execute 'generate-credits'.

        Lists every object under --prefix, groups them into subjects, applies
        the --reviewed-only / --pattern / --marker-error-cutoff filters, then
        looks up each uploader's profile.json and writes the credits file.

        Side effects: reads from S3; writes DATA_LICENSE.txt (PROD deployment)
        or DATA_LICENSE_DEV_SERVER.txt (otherwise) in the working directory.
        """
        if args.command != 'generate-credits':
            return
        pattern: Optional[str] = args.pattern
        prefix: str = args.prefix
        marker_error_cutoff: Optional[float] = args.marker_error_cutoff
        reviewed_only: bool = args.reviewed_only

        # Compile the pattern as a regex
        regex = re.compile(pattern) if pattern is not None else None

        s3 = ctx.aws_session.client('s3')

        print(f'Listing files on S3 at {prefix} in bucket {ctx.deployment["BUCKET"]}...')
        response = s3.list_objects_v2(
            Bucket=ctx.deployment['BUCKET'], Prefix=prefix)

        files: List[Tuple[str, int, str]] = []
        keys: List[str] = []

        print(f'Listing files on S3 at {prefix}...')
        while True:
            if 'Contents' in response:
                for obj in response['Contents']:
                    key: str = obj['Key']
                    size: int = obj['Size']
                    e_tag: str = obj['ETag']
                    files.append((key, size, e_tag))
                    keys.append(key)

            # list_objects_v2 paginates: follow the continuation token until done.
            if response['IsTruncated']:
                print(f'Have {len(files)} files so far. Listing next page of files to download at {prefix}...')
                continuation_token = response['NextContinuationToken']
                response = s3.list_objects_v2(
                    Bucket=ctx.deployment['BUCKET'], Prefix=prefix, ContinuationToken=continuation_token, MaxKeys=10000)
            else:
                print(f'Finished listing files to download at {prefix}. Found {len(files)} files.')
                break

        # A subject is identified by the presence of a "<path>_subject.json" object.
        subject_paths: List[str] = []
        for key, size, e_tag in files:
            if key.endswith("_subject.json"):
                subject_paths.append(key.replace("_subject.json", ""))

        # Group every listed object under the first subject path that prefixes it.
        subject_file_sets: Dict[str, List[Tuple[str, int, str]]] = {}
        for key, size, e_tag in files:
            for subject_path in subject_paths:
                if key.startswith(subject_path):
                    subject_file_sets.setdefault(subject_path, []).append((key, size, e_tag))
                    break

        subjects: List[SubjectToDownload] = [
            SubjectToDownload(subject_path, subject_file_sets[subject_path])
            for subject_path in subject_paths
            if subject_path in subject_file_sets
        ]

        print(f'Found {len(subjects)} subjects to download.')

        if reviewed_only:
            subjects = [subject for subject in subjects if subject.is_reviewed]
            print(f'After filtering for subjects that have been reviewed, have {len(subjects)} subjects to download.')

        if regex is not None:
            subjects = [subject for subject in subjects if regex.match(subject.path)]
            print(f'After filtering for regex "{pattern}" on subject paths, have {len(subjects)} subjects to download.')

        if marker_error_cutoff is not None:
            skip_files: List[bool] = []
            for subject in subjects:
                # BUGFIX: the flag must be reset for every subject. It previously
                # leaked from the prior iteration (and was unbound on the first),
                # and a stray 'break' aborted this loop entirely, leaving
                # skip_files shorter than subjects and crashing the filter below
                # with an IndexError. The skip/include messages also printed the
                # stale listing variable 'key' instead of the current subject.
                skip_file = False
                results_key = subject.path + "_results.json"
                if results_key in keys:
                    try:
                        response = s3.get_object(Bucket=ctx.deployment['BUCKET'], Key=results_key)
                        file_content = response['Body'].read().decode('utf-8')
                        results_json = json.loads(file_content)
                        if 'autoAvgRMSE' in results_json:
                            error_meters = results_json['autoAvgRMSE']
                            if error_meters > marker_error_cutoff:
                                print('!! Skipping ' + subject.path + ' because the marker error is ' +
                                      str(error_meters) + ' m')
                                skip_file = True
                            else:
                                print('Including ' + subject.path + ' because the marker error is ' +
                                      str(error_meters) + ' m')
                    except Exception:
                        print('!! Skipping ' + subject.path + ' because we could not read the results file.')
                        skip_file = True
                else:
                    # No results file at all: we cannot verify quality against the
                    # cutoff, so skip — mirroring the unreadable-results case above.
                    print('!! Skipping ' + subject.path + ' because it has no results file.')
                    skip_file = True
                skip_files.append(skip_file)

            subjects = [subject for i, subject in enumerate(subjects) if not skip_files[i]]
            print(f'After filtering for marker error cutoff, have {len(subjects)} subjects to download.')

        usernames: Set[str] = set(subject.username for subject in subjects)

        # NOTE(review): this download manifest is computed but never used below —
        # it appears copied from a download command. Left in place so observable
        # behavior is unchanged; consider removing it.
        to_download: List[str] = []
        to_download_e_tags: List[str] = []
        to_download_sizes: List[int] = []
        to_download_size: int = 0
        already_downloaded_size: int = 0

        for subject in subjects:
            for key, size, e_tag in subject.contained_files:
                # Deduplicate identical non-empty objects by ETag.
                if size > 0 and e_tag in to_download_e_tags:
                    continue
                if key.endswith('.b3d') or key.endswith('review.json') or key.endswith('REVIEWED'):
                    to_download.append(key)
                    to_download_e_tags.append(e_tag)
                    to_download_sizes.append(size)
                    to_download_size += size

        print('A total of ' + str(len(usernames)) + ' AddBiomechanics users will be credited in the ATTRIBUTION.txt file.')

        credit_list: List[str] = []
        for username in usernames:
            credit = username
            # username keeps its trailing '/', so the profile link ends with '/'
            # and the protected/ key below joins cleanly without an extra slash.
            profile_link = 'https://' + ('dev' if ctx.deployment['NAME'] == 'DEV' else 'app') + '.addbiomechanics.org/profile/' + username.replace('us-west-2:', '')

            # Try to get the profile.json file, if it exists
            profile_key: str = "protected/" + str(username) + "profile.json"
            try:
                response = s3.get_object(Bucket=ctx.deployment['BUCKET'], Key=profile_key)
                file_content = response['Body'].read().decode('utf-8')
                profile_json = json.loads(file_content)
                name = profile_json.get('name', '')
                surname = profile_json.get('surname', '')
                if name != '' or surname != '':
                    credit = name + ' ' + surname + ' (' + profile_link + ')'
            except Exception:
                # No readable profile: credit anonymously but keep the link.
                credit = 'Anonymous (' + profile_link + ')'

            credit_list.append(credit)

        data_credits = 'Data Licensed as Creative Commons BY 4.0 (See https://creativecommons.org/licenses/by/4.0/ for details)\nCredits:\n'
        for credit in credit_list:
            data_credits += ' - ' + credit + '\n'
        print(data_credits)
        with open('DATA_LICENSE.txt' if ctx.deployment['NAME'] == 'PROD' else 'DATA_LICENSE_DEV_SERVER.txt', 'w') as f:
            f.write(data_credits)
0 commit comments