# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:light
#     text_representation:
#       extension: .py
#       format_name: light
#       format_version: '1.5'
#       jupytext_version: 1.5.2
#   kernelspec:
#     display_name: Python 3
#     language: python
#     name: python3
# ---

# # Working with automated computations: Imported tables

# Welcome back! In this session, we are going to continue working with the pipeline for the mouse electrophysiology example.
#
# We will learn to:
#
# * import neuron activity data from data files into an `Imported` table
# * automatically trigger computations for all missing entries with `populate`

# First things first, let's import `datajoint` again.

import datajoint as dj

# As we are going to perform some computations, let's go ahead and import NumPy as well as Matplotlib.

import numpy as np
import matplotlib.pyplot as plt
# %matplotlib inline

# Now we would like to continue working with the tables we defined in the previous notebook. To do so, we need the classes for each table: `Mouse` and `Session`. We could redefine them here, but for your convenience, we have included the schema and table class definitions in a package called `tutorial_pipeline.mouse_session`, from which you can import the classes as well as the schema object. We will use the schema object again to define more tables.

import sys
sys.path.append("..")
from tutorial_pipeline.mouse_session import schema, Mouse, Session

Mouse()

Session()

# The `mouse_session` module also fills each table with data to make sure we are all on the same page.

# ## Importing data from data files

# Recall from the project description
# > * In each experimental session, you record electrical activity from a single neuron. You use recording equipment that produces separate data files for each neuron you recorded.
#
# Our recording equipment produces a data file for each neuron recorded. Since we record from one neuron per session, there should be one data file for each session.

# In the `data` directory, you will find `.npy` (saved NumPy array) files with names like `data_100_2017-05-25.npy`.

# As you might have guessed, these are the data for the recording sessions in the `Session` table, and each file is named according to the `mouse_id` and `session_date` - the primary key attributes - in the format `data_{mouse_id}_{session_date}.npy`.
#
# So `data_100_2017-05-25.npy` is the data for the session identified by `mouse_id = 100` and `session_date = "2017-05-25"`.

# ## Looking at the data

# Let's take a quick peek at the data file content.

# First, let's pick a session to load the data for. To do this, we are going to first fetch the **primary key attributes** of `Session` as a list of dictionaries. We make use of the special `fetch('KEY')` syntax to achieve this.

keys = Session.fetch('KEY')
keys

# Any item in this list of keys can be used to uniquely identify a single session!

# ENTER YOUR CODE! - restrict session by an element of keys

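# One possible answer (shown for reference) - restricting `Session` by any one element of `keys` returns exactly that session:

Session & keys[0]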

# Let's take the first key and generate the file name that corresponds to this session. Remember the `data_{mouse_id}_{session_date}.npy` filename convention!

key = keys[0]
key

filename = 'data/data_{mouse_id}_{session_date}.npy'.format(**key)
filename

# Here we have made use of Python's dictionary unpacking and the `format` method on strings to generate the filename from the `key`.
#
# Finally, let's load the file.

data = np.load(filename)

# Look at its content...

data

# ...and check the shape of the data.

data.shape

# So this particular file contains a NumPy array of shape 1 x 1000. This represents a (simulated) recording of raw electric activity from neuron(s) (1st dimension) over 1000 time bins (2nd dimension).

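# As a quick sanity check, we can also plot the activity trace with the Matplotlib we imported above (a minimal sketch; the axis labels are our own choice):

plt.plot(data[0])
plt.xlabel('time bin')
plt.ylabel('raw activity')
plt.show()
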
# ## Defining the table for recorded neurons

# We now would like to have all these recorded neurons represented and stored in our data pipeline.
#
# Since there may be multiple neurons recorded in each session, a `Neuron` is uniquely identified by the `Session` it was recorded in together with its `neuron_id`. For each `Neuron`, we want to store the neural activity found in the data file.

@schema
class Neuron(dj.Imported):
    definition = """
    -> Session
    neuron_id: int
    ---
    activity: longblob  # electric activity of the neuron
    """

# Let's check the state of our pipeline.

# ENTER YOUR CODE! - plot ERD of the schema

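# One possible answer (shown for reference):

dj.ERD(schema)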

# We defined `activity` as a `longblob` so that it can store a NumPy array holding the electric activity over time. This NumPy array will be imported from the file corresponding to each neuron.

# Note that our `Neuron` class inherits from `dj.Imported` instead of `dj.Manual` like the others. This is because **this table's content will depend on data imported from an external file**. `Manual` and `Imported` are two of the **table tiers**.

# ## DataJoint table tiers

# In DataJoint, the tier of a table indicates **the nature of the data and the data source for the table**. So far we have encountered two table tiers: `Manual` and `Imported`, and we will encounter the two other major tiers in the next session.
#
# DataJoint tables in the `Manual` tier, or simply **Manual tables**, hold contents that are **manually** entered by either experimenters or a recording system, and their contents **do not depend on external data files or other tables**. This is the most basic table type you will encounter, especially as the tables at the beginning of the pipeline. In the ERD, `Manual` tables are depicted by green rectangles.
#
# On the other hand, **Imported tables** pull data (or *import* data) from external data files, and come equipped with functionality to perform this importing process automatically, as we will see shortly! In the ERD, `Imported` tables are depicted by blue ellipses.

dj.ERD(schema)

# ## Importing data into the `Imported` table

# Rather than filling out the content of the table manually using the `insert1` or `insert` methods, we are going to make use of the `make` and `populate` logic that comes with `Imported` tables to automatically figure out what needs to be imported and perform the import!

# ## `make` and `populate` methods

# `Imported` tables come with a special method called `populate`. Let's try calling it.

# ENTER YOUR CODE! - call `populate` on the table

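# One possible answer (shown for reference) - as discussed next, this call will complain that `make` is not implemented:

Neuron.populate()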

# Notice that the `populate` call complained that a method called `make` is not implemented. Let's write a simple `make` method to elucidate what this is all about.

@schema
class Neuron(dj.Imported):
    definition = """
    -> Session
    neuron_id: int
    ---
    activity: longblob  # electric activity of the neuron
    """
    def make(self, key):  # `make` takes a single argument `key`
        print('key is', key)

# Now, let's call `populate` again!

# ENTER YOUR CODE! - call `populate` on the table

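# As before, one possible answer - this time `make` gets called once for each `Session` entry:

Neuron.populate()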

# When you call `populate` on an `Imported` table, this triggers DataJoint to look up all the tables that the `Imported` table depends on.
#
# For **every unique combination of entries in the depended-on, or "parent", tables**, DataJoint calls the `make` method, passing in the primary key of the parent(s).

# Because `Neuron` depends on `Session`, `Neuron`'s `make` method was called for each entry of `Session`.

Session()


# Note that `make` only receives the *primary key attributes* of `Session` (`mouse_id` and `session_date`) but not the other attributes.

# ## Implementing `make`

# Now that we have a better understanding of `make`, let's implement it to perform the importing of data from the file.

@schema
class Neuron(dj.Imported):
    definition = """
    -> Session
    neuron_id: int
    ---
    activity: longblob  # electric activity of the neuron
    """
    def make(self, key):
        # use the key dictionary to determine the data file path
        data_file = "data/data_{mouse_id}_{session_date}.npy".format(**key)

        # load the data
        data = np.load(data_file)

        for idx, d in enumerate(data):
            # add the index along the 1st dimension as neuron_id
            key['neuron_id'] = idx

            # add the loaded data as the "activity" attribute
            key['activity'] = d

            # insert the entry into self (the Neuron table)
            self.insert1(key)

            print('Populated neuron={neuron_id} for mouse_id={mouse_id} on session_date={session_date}'.format(**key))


# Notice that we added the missing attribute information `activity` into the `key` dictionary, and finally **inserted the entry** into `self`, that is, the `Neuron` table. The `make` method's job is to create and insert a new entry corresponding to the `key` into this table!

# Finally, let's go ahead and call `populate` to actually populate the `Neuron` table, filling it with data loaded from the data files!

Neuron.populate()

Neuron()

# As you can see, in these example datasets we only have data for one neuron per session.

# What happens if we call `Neuron.populate` again?

Neuron.populate()

# That's right - nothing! This makes sense, because we have already imported a `Neuron` for every entry in `Session`, and nothing is left to be imported.

# Now what happens if we insert a new entry into `Session`?

Session.insert1({
    "mouse_id": 100,
    "session_date": "2017-06-01",
    "experiment_setup": 1,
    "experimenter": "Jacob Reimer"
})

# We can find all `Session` entries *without* a corresponding `Neuron` entry using the **negative restriction operator** `-`.

# select all Session entries *without* a corresponding entry in Neuron
Session - Neuron

Neuron.populate()

Neuron()

# # Summary

# Congratulations! You have successfully extended your pipeline with a table to represent recorded data (`Neuron` as an `Imported` table), and you have learned to implement the `make` method and the `populate` call to load external data into your tables.

dj.ERD(schema)

# At this point, our pipeline contains the core elements with data populated, ready for further downstream analysis.
#
# In the next [session](./03-Computed%20Table%2C%20Lookup%20Table%2C%20and%20Part%20Table%20-%20Interactive.ipynb), we are going to introduce the concepts of `Computed` and `Lookup` tables, as well as learn to set up an automated computation routine.