
Commit 9ea3e81

Merge pull request #104 from AllenNeuralDynamics/han_softwarename_to_stimulus_epochs
add querying "dynamic-foraging-task" from stimulus_epochs
2 parents: 92bcac6 + c2e7e22

File tree

1 file changed: +18 −14


code/util/fetch_data_docDB.py

Lines changed: 18 additions & 14 deletions
@@ -114,15 +114,19 @@ def fetch_dynamic_foraging_data(client):
     # let's directly query the software name
     logger.warning("fetching 'dynamic foraging' in software name...")
     software_name_results = client.retrieve_docdb_records(
-        filter_query={"session.data_streams.software.name": "dynamic-foraging-task",
-                      "name": {"$not": {"$regex": ".*processed.*"}},  # only raw data
-                      },
-        paginate_batch_size=500
-    )
+        filter_query={
+            "$or": [
+                {"session.data_streams.software.name": "dynamic-foraging-task"},
+                {"session.stimulus_epochs.software.name": "dynamic-foraging-task"},
+            ],
+            "name": {"$not": {"$regex": ".*processed.*"}},  # only raw data
+        },
+        paginate_batch_size=500,
+    )
     logger.warning(f"found {len(software_name_results)} results")
 
-    # there are more from the past that didn't specify modality correctly.
-    # until this is fixed, need to guess by asset name
+    # there are more from the past that didn't specify modality correctly.
+    # until this is fixed, need to guess by asset name
     logger.warning("fetching FIP records by name...")
     name_FIP_results = client.retrieve_docdb_records(
         filter_query={"name": {"$regex": "^FIP.*"}},
@@ -134,32 +138,32 @@ def fetch_dynamic_foraging_data(client):
     unique_results_by_id = {**{ r['_id']: r for r in software_name_results }, **{ r['_id']: r for r in name_FIP_results }}
     results = list(unique_results_by_id.values())
     logger.warning(f"found {len(results)} unique results")
-
+
     # make a dataframe
     records_df = pd.DataFrame.from_records([map_record_to_dict(d) for d in results ])
-
+
     # PREVIOUSLY, there are some sessions uploaded twice in two different locations.
     # let's filter out the ones in aind-ophys-data, a deprecated location
-    # this is no longer a problem-- so I'm taking off the drop but keeping the dupe check on.
+    # this is no longer a problem-- so I'm taking off the drop but keeping the dupe check on.
     dup_df = records_df[records_df.duplicated('session_name',keep=False)]
     dup_df = dup_df[dup_df.session_loc.str.contains("aind-ophys-data")]
     if len(dup_df):
         logger.warning('duplicated entries found, please fix')
         # records_df = records_df.drop(dup_df.index.values)
-
+
     # let's get processed results too
     logger.warning("fetching processed results...")
     processed_results = client.retrieve_docdb_records(filter_query={
         "name": {"$regex": "^behavior_.*processed_.*"}
     })
-
+
     # converting to a dictionary
    processed_results_by_name = { r['name']: r for r in processed_results }
-
+
     # adding two columns to our master dataframe - result name and result s3 location
     records_df['processed_session_name'] = records_df.session_name.apply(lambda x: find_result(x, processed_results_by_name).get('name'))
     records_df['processed_session_loc'] = records_df.session_name.apply(lambda x: find_result(x, processed_results_by_name).get('location'))
-    # get external_links, strip it down to the string
+    # get external_links, strip it down to the string
     co_data_asset_id_processed = records_df.session_name.apply(lambda x: find_result(x, processed_results_by_name).get('external_links'))
     records_df['processed_CO_dataID'] = strip_dict_for_id(co_data_asset_id_processed)
     records_df['CO_dataID'] = strip_dict_for_id(records_df['CO_dataID'])
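
For context, the change broadens the DocDB filter with a MongoDB $or clause, so raw assets that record the software name under session.stimulus_epochs (rather than session.data_streams) are also returned. Below is a minimal standalone sketch of the broadened query, assuming the MetadataDbClient from aind-data-access-api; the host, database, and collection values are illustrative placeholders, not taken from this commit.

# Standalone sketch (not part of the commit): run the broadened query directly.
from aind_data_access_api.document_db import MetadataDbClient

# Placeholder connection details; substitute your actual DocDB endpoint.
client = MetadataDbClient(
    host="api.allenneuraldynamics.org",
    database="metadata_index",
    collection="data_assets",
)

records = client.retrieve_docdb_records(
    filter_query={
        # match the software name in either location of the session schema
        "$or": [
            {"session.data_streams.software.name": "dynamic-foraging-task"},
            {"session.stimulus_epochs.software.name": "dynamic-foraging-task"},
        ],
        # exclude processed assets; keep raw data only
        "name": {"$not": {"$regex": ".*processed.*"}},
    },
    paginate_batch_size=500,
)
print(f"found {len(records)} records")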
