@@ -114,15 +114,19 @@ def fetch_dynamic_foraging_data(client):
     # let's directly query the software name
     logger.warning("fetching 'dynamic foraging' in software name...")
     software_name_results = client.retrieve_docdb_records(
-        filter_query={"session.data_streams.software.name": "dynamic-foraging-task",
-                      "name": {"$not": {"$regex": ".*processed.*"}},  # only raw data
-                      },
-        paginate_batch_size=500
-    )
+        filter_query={
+            "$or": [
+                {"session.data_streams.software.name": "dynamic-foraging-task"},
+                {"session.stimulus_epochs.software.name": "dynamic-foraging-task"},
+            ],
+            "name": {"$not": {"$regex": ".*processed.*"}},  # only raw data
+        },
+        paginate_batch_size=500,
+    )
     logger.warning(f"found {len(software_name_results)} results")
 
-    # there are more from the past that didn't specify modality correctly.
-    # until this is fixed, need to guess by asset name
+    # there are more from the past that didn't specify modality correctly.
+    # until this is fixed, need to guess by asset name
     logger.warning("fetching FIP records by name...")
     name_FIP_results = client.retrieve_docdb_records(
         filter_query={"name": {"$regex": "^FIP.*"}},
@@ -134,32 +138,32 @@ def fetch_dynamic_foraging_data(client):
     unique_results_by_id = {**{r['_id']: r for r in software_name_results}, **{r['_id']: r for r in name_FIP_results}}
     results = list(unique_results_by_id.values())
     logger.warning(f"found {len(results)} unique results")
-
+
     # make a dataframe
     records_df = pd.DataFrame.from_records([map_record_to_dict(d) for d in results])
-
+
     # PREVIOUSLY, there are some sessions uploaded twice in two different locations.
     # let's filter out the ones in aind-ophys-data, a deprecated location
-    # this is no longer a problem-- so I'm taking off the drop but keeping the dupe check on.
+    # this is no longer a problem-- so I'm taking off the drop but keeping the dupe check on.
     dup_df = records_df[records_df.duplicated('session_name', keep=False)]
     dup_df = dup_df[dup_df.session_loc.str.contains("aind-ophys-data")]
     if len(dup_df):
         logger.warning('duplicated entries found, please fix')
         # records_df = records_df.drop(dup_df.index.values)
-
+
     # let's get processed results too
     logger.warning("fetching processed results...")
     processed_results = client.retrieve_docdb_records(filter_query={
         "name": {"$regex": "^behavior_.*processed_.*"}
     })
-
+
     # converting to a dictionary
     processed_results_by_name = {r['name']: r for r in processed_results}
-
+
     # adding two columns to our master dataframe - result name and result s3 location
     records_df['processed_session_name'] = records_df.session_name.apply(lambda x: find_result(x, processed_results_by_name).get('name'))
     records_df['processed_session_loc'] = records_df.session_name.apply(lambda x: find_result(x, processed_results_by_name).get('location'))
-    # get external_links, strip it down to the string
+    # get external_links, strip it down to the string
     co_data_asset_id_processed = records_df.session_name.apply(lambda x: find_result(x, processed_results_by_name).get('external_links'))
     records_df['processed_CO_dataID'] = strip_dict_for_id(co_data_asset_id_processed)
     records_df['CO_dataID'] = strip_dict_for_id(records_df['CO_dataID'])
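
For reference, a minimal standalone sketch of the broadened raw-data query introduced in the first hunk. The filter document is copied from the diff; the `MetadataDbClient` import and its host/database/collection values are assumptions about a typical AIND DocDB setup, not something this commit specifies.

```python
# Minimal sketch: run the broadened raw-data query on its own.
# NOTE: the host/database/collection values below are assumptions, not from this commit.
import logging

from aind_data_access_api.document_db import MetadataDbClient

logger = logging.getLogger(__name__)

client = MetadataDbClient(
    host="api.allenneuraldynamics.org",  # assumed public DocDB API host
    database="metadata_index",           # assumed database name
    collection="data_assets",            # assumed collection name
)

# Match the task software name in either data_streams or stimulus_epochs,
# and exclude processed assets by name (same filter as in the diff above).
raw_filter = {
    "$or": [
        {"session.data_streams.software.name": "dynamic-foraging-task"},
        {"session.stimulus_epochs.software.name": "dynamic-foraging-task"},
    ],
    "name": {"$not": {"$regex": ".*processed.*"}},
}

records = client.retrieve_docdb_records(filter_query=raw_filter, paginate_batch_size=500)
logger.warning(f"found {len(records)} results")
```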
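The helpers `map_record_to_dict`, `find_result`, and `strip_dict_for_id` are defined elsewhere in this module and are not part of this diff. As a rough illustration of the "strip it down to the string" step, a hypothetical version of `strip_dict_for_id` might look like the sketch below; the assumed shape of `external_links` (a dict such as `{"Code Ocean": ["<asset-id>"]}`) is a guess, not taken from this commit.

```python
# Hypothetical sketch, not the repo's actual implementation of strip_dict_for_id.
# Assumes external_links looks like {"Code Ocean": ["<asset-id>"]}; records that
# already store a plain string (or None) are passed through unchanged.
import pandas as pd


def strip_dict_for_id(links: pd.Series) -> pd.Series:
    """Reduce each external_links entry to a single Code Ocean asset-id string."""

    def _strip(entry):
        if isinstance(entry, dict):
            # e.g. {"Code Ocean": ["<asset-id>"]} -> "<asset-id>"
            values = next(iter(entry.values()), None)
            if isinstance(values, (list, tuple)):
                return values[0] if values else None
            return values
        return entry  # already a string or None

    return links.apply(_strip)
```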