
Commit 1696fdd

cleanup + readme
1 parent 70dd5b4 commit 1696fdd

25 files changed: +631 / -862 lines

src/webapp/.env.example

Lines changed: 7 additions & 4 deletions
```diff
@@ -15,9 +15,11 @@ DB_CERT=""
 DB_ROOT_CERT=""
 DB_KEY=""
 
-# The initial user credentials for DEV (facilitates development)
-DEV_INIT_DB_PASSWORD="<PUT PASSWORD HERE>"
-DEV_INIT_DB_USER="tester@datakind.org"
+# Generate the following using `openssl rand -hex 32`
+# This is the initial API key for one-time use during setup.
+INITIAL_API_KEY=""
+# Its corresponding ID. You can generate this using uuid4, e.g. uuid.uuid4() in python3
+INITIAL_API_KEY_ID=""
 
 # GCP related env vars
 GCP_REGION="us-east4"
@@ -32,4 +34,5 @@ DATABRICKS_HOST_URL=""
 DATABRICKS_SERVICE_ACCOUNT_EMAIL=""
 
 # Datakinders allowed to issue API keys. This should be the MINIMUM set. Keep this group small. Pass as a comma-separated string structured like so: "abc@dk.org,bcd@dk.org"
-API_KEY_ISSUERS="tester@datakind.org"
+# The initial value set is "api_key_initial", the initial API key; this is needed for one-time setup. You can remove it once the API key table is populated.
+API_KEY_ISSUERS="api_key_initial"
```
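The comments in the file describe how to generate both values; an equivalent stdlib-Python sketch (`secrets.token_hex(32)` produces the same shape of value as `openssl rand -hex 32`; the printed values are random each run):

```python
import secrets
import uuid

# Equivalent of `openssl rand -hex 32` for the key, and uuid.uuid4() for its ID.
initial_api_key = secrets.token_hex(32)
initial_api_key_id = str(uuid.uuid4())

# Print in the format the .env file expects.
print(f'INITIAL_API_KEY="{initial_api_key}"')
print(f'INITIAL_API_KEY_ID="{initial_api_key_id}"')
```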

src/webapp/README.md

Lines changed: 104 additions & 34 deletions
````diff
@@ -1,22 +1,84 @@
-Notes:
+# Overview of the REST API for SST
 
-REST API for SST functionality.
+## See Swagger UI for self-documentation of all currently available API endpoints
 
-Notes:
-### API Callers
+Go to `<env>-sst.datakind.org/api/v1/docs`: e.g. https://dev-sst.datakind.org/api/v1/docs
 
-API callers will need to create a user using the backend and then generate an API token. They will also need the GCloud upload auth token.
+Note that the dev and staging links are behind a GCP Identity-Aware Proxy.
 
-### Prerequisites
+## Authentication
 
-In order to work with and test GCS related functionality, you'll need to setup default credentials:
-https://cloud.google.com/docs/authentication/set-up-adc-local-dev-environment#local-user-cred
+Authentication to the API is primarily via JWTs in the Authorization header of HTTP calls. These JWTs are short-term tokens that expire and are signed by the API. They are somewhat more secure than using API keys directly to authenticate every call, since API keys are long-term/non-expiring credentials that are more powerful if stolen. So the mechanism used here is that API keys are exchanged for JWTs, which are then used to authenticate each call and can carry additional information such as "enduser" identity.
 
-You will also need to add the permission Storage Writer or Storage Admin to your Datakind Account in GCP to allow for local interaction with the storage buckets.
+Note that, intentionally, there is no way to use the user table's email/password combination to authenticate to the API directly. An API key is required. This means that a "user" is almost exclusively a frontend concept. However, the backend does retain access to the user table to look up access type, institution, and so on. See the Databases section below.
 
-Note that to generate GCP URLS you'll need a service account key (doesn't work locally).
+There are also multiple types of API keys, and they can have the same access types as a user. Additionally, API keys can have the attribute "allows_enduser". BE CAUTIOUS WHEN SETTING THIS TO TRUE when generating API keys: it means the key can impersonate any user. Such a key should really only be used by the frontend, which needs to do enduser impersonation. Note that only DATAKINDER access types can allow endusers.
 
-### For local testing:
+Additionally, API keys, like users, can have an institution set. DATAKINDER type keys should not set an institution.
+
+### Authenticating/Generating Tokens via the Swagger UI
+
+NOTE: Treat keys as secrets. They grant full access to this API.
+
+0. Get a valid API key. In the LOCAL environment, you can use the `INITIAL_API_KEY` set in your `.env` file or `key_1`. In other environments, you can use the `INITIAL_API_KEY` set in the `.env` file or any existing generated API key.
+1. Hit the Authorize button at the top right and enter a valid API key in the `api-key` field (ignore the `api_key_scheme` username/password fields -- they exist to let FastAPI auto-populate bearer tokens generated from API keys; the actual username/password fields intentionally do not work).
+2. Generate a token using the `/token-from-api-key` POST method. This is the only endpoint you can access with the API key directly.
+3. Take the resulting token value, which you can then use to curl any endpoint. For example:
+
+```
+$ curl -X 'GET' \
+  'http://127.0.0.1:8000/api/v1/non-inst-users' \
+  -H 'Authorization: Bearer <paste_the_token_here>'
+```
+
+In the long term, look into having the API key --> token conversion handled directly by FastAPI so that the Swagger UI can do the conversion itself and you won't have to curl with your token.
+
+## Databases
+
+All data is stored in MySQL databases (for dev/staging/prod, these are databases in GCP's Cloud SQL). The main file you'll want to look at is [src/webapp/database.py](https://github.com/datakind/sst-app-api/blob/develop/src/webapp/database.py).
+
+At the time of writing, the tables the API cares about and tracks are as follows:
+
+* Institution Table ("inst"): the institutions, including info about them like PDP ID if applicable, creator/creation time, etc.
+* API Key Table ("apikey"): the API keys, including access type, valid status (you can disable a key), etc.
+* Account Table ("users"): **THIS TABLE IS (the only table) SHARED WITH THE FRONTEND**. It contains enduser email/password, access types, institution if applicable, etc. Because this table is shared with the frontend, any changes to the table definition should be reflected in the ORM handling the table in both the frontend _and_ the backend. Note that, intentionally, there is no way to create new users from the backend: the backend only uses API keys to authenticate, and it also lacks some required fields, such as the team id generation that Laravel requires to use the user table. The frontend can create users directly in the table, which the backend can then read.
+* Account History Table ("account_history"): audit trail of certain events undertaken by users. TODO: interactions with this table largely remain unimplemented.
+* File Table ("file"): tracks files.
+* Batch Table ("batch"): tracks batches.
+* Model Table ("model"): tracks models.
+* Job Table ("job"): tracks Databricks jobs, storing the per-run unique job_run_id. Job status is also partially tracked here. Note that failed jobs are currently indistinguishable from incomplete jobs.
+
+NOTE: the naming convention is a singular descriptor for the table name; however, the users table has to follow Laravel's table naming convention, which calls the users table "users".
+
+## Testing
+
+Unit test files are named `<file_under_test>_test.py` to correspond with the files they are testing. Unit tests only test behavior introduced by logic written in those files and do not test any integration with other systems. To respect test isolation, we have the following levels of testing:
+
+1. Unit tests where all other systems are mocked out (e.g. Databricks, GCP storage, etc.).
+2. Dev environment with fake data to test integration on real systems, as all integration points are connected to the real endpoints in Databricks, GCP, etc.
+3. Staging environment with real data (potentially sampled if your datasets are large) to test real data flowing through the full end-to-end setup of a real system that mimics prod.
+4. Prod environment with real data on real systems.
+
+This means that for functions mainly doing integration work, we do not write unit tests, as we assume external systems work, and mocking and testing those integration points would be near-useless. They can be tested at level 2 in the dev environment, which is set up for just this purpose.
+
+This also means it's not recommended for the local environment to connect to the dev environment. The four environments, `local`, `dev`, `staging`, `prod`, should be isolated from each other.
+
+While working in the local environment, it's recommended you mock out/stub out calls to external systems. If you don't want to do that, look into the official documentation on how to authenticate to GCS and Databricks from your local environment.
+
+### Comment on Deployment
+
+* Dev environment: deployed upon any new commit to b/develop.
+* Staging environment: requires a manual Cloud Build Trigger Run initiated by a human to pick up the most recent changes from b/develop.
+* Prod environment: requires a manual Cloud Build Trigger Run initiated by a human to pick up the most recent changes from b/develop.
+
+For more information on deployment, see the Terraform setup and the GCP setup in the GCP console.
+
+## Package Management
+
+Package management is done via [uv](https://docs.astral.sh/uv/). When adding a new package, add it according to the uv documentation and keep the `uv.lock` and `pyproject.toml` files up to date.
+
+## Local Environment Setup
 
 Enter into the root directory of the repo.
 
````
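The API-key-for-JWT exchange described in the Authentication section can be illustrated with a stdlib-only toy. This is not the service's code (the real API uses PyJWT and FastAPI and loads its signing secret from env vars); the secret and claim names here are made up for illustration:

```python
import base64
import hashlib
import hmac
import json
import time

SECRET = b"demo-secret"  # made up for this sketch; the real service keeps its key in env vars


def _b64(data: bytes) -> str:
    """URL-safe base64 without padding, as JWTs use."""
    return base64.urlsafe_b64encode(data).rstrip(b"=").decode()


def mint_token(subject: str, expires_in_s: int = 7200) -> str:
    """Toy HS256 JWT: roughly the shape of what /token-from-api-key hands back."""
    header = _b64(json.dumps({"alg": "HS256", "typ": "JWT"}).encode())
    payload = _b64(json.dumps({"sub": subject, "exp": int(time.time()) + expires_in_s}).encode())
    signing_input = f"{header}.{payload}".encode()
    signature = _b64(hmac.new(SECRET, signing_input, hashlib.sha256).digest())
    return f"{header}.{payload}.{signature}"


def token_is_valid(token: str) -> bool:
    """Check the signature and expiry, as the API does on every authenticated call."""
    header, payload, signature = token.split(".")
    signing_input = f"{header}.{payload}".encode()
    expected = _b64(hmac.new(SECRET, signing_input, hashlib.sha256).digest())
    if not hmac.compare_digest(signature, expected):
        return False
    claims = json.loads(base64.urlsafe_b64decode(payload + "=" * (-len(payload) % 4)))
    return claims["exp"] > time.time()


token = mint_token("api_key_initial")
print(token_is_valid(token))  # True
```

The point of the design is visible in `token_is_valid`: a stolen token is only useful until `exp`, whereas a stolen API key would be useful indefinitely.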

````diff
@@ -25,47 +87,55 @@ Enter into the root directory of the repo.
 1. `source .venv/bin/activate`
 1. `pip install uv`
 1. `uv sync --all-extras --dev`
-1. `coverage run -m pytest -v -s ./src/webapp/`
 
-For integration with Databricks, run:
+You're now in your virtual env with all your dependencies added.
 
-The workspace URL will look like `https://<some_id>.gcp.databricks.com`
+For all of the following, the steps above are prerequisites and you should be in the root folder of `sst-app-api/`.
 
-1. `databricks auth login --host <workspace_url>`
+### Spin up the app locally
 
-For all of the following, be in the repo root folder (`sst-app-api/`).
+1. `export ENV_FILE_PATH=<full_path_to_your_webapp_.env_file>`
+1. `fastapi dev src/webapp/main.py --port 8000`
+1. Go to `http://127.0.0.1:8000/api/v1/docs`
+1. Hit the `Authorize` button at the top right and enter `key_1` in the `api-key` field (scroll past/ignore the `api_key_scheme` fields).
+1. Generate a token using the `/token-from-api-key` POST method.
+1. Use the token to curl any endpoint. For example:
 
-If you need to generate signed URLs to upload data to GCS you should impersonate a service account.
-You can use the [cloud run service account](https://console.cloud.google.com/iam-admin/iam) or create
-your own with the desired permissions.
+```
+$ curl -X 'GET' \
+  'http://127.0.0.1:8000/api/v1/non-inst-users' \
+  -H 'Authorization: Bearer <paste_the_token_here>'
+```
 
-1. `gcloud auth application-default login --impersonate-service-account <service-account-email>`
+### Before committing, run the formatter and the unit tests
 
-Spin up the app locally:
+1. Formatter: `black src/webapp/.`
+1. Unit tests: `coverage run -m pytest -v -s ./src/webapp/`
 
-1. `ENV_FILE_PATH='/full/path/to/.env' fastapi dev src/webapp/main.py`
-1. Go to `http://127.0.0.1:8000/docs`
-1. Hit the `Authorize` button on the top right and enter the tester credentials:
+#### Optionally run pylint
 
-* username: `tester@datakind.org`
-* password: `tester_password`
+`uv run pylint './src/webapp/*' --errors-only` for errors only.
 
-Before committing, make sure to run:
+Non-error pylint is very opinionated, and **SOMETIMES WRONG**. For example, it warns to switch `== None` to `is None`, including in SQL query where-clauses. THIS WILL CAUSE THE SQL QUERY TO NOT WORK (it appears to be due to how SQLAlchemy interprets the clauses). So be careful when following the recommendations from pylint.
 
-1. `black src/webapp/.`
-1. Test using `coverage run -m pytest -v -s ./src/webapp/*.py`
-1. Test using `coverage run -m pytest -v -s ./src/webapp/routers/*.py`
+## Usage Notes
 
-### Notes:
+Some general things that may be helpful to call out.
 
-postgresql requires that SSL certs be 0600 or 0640 depending on group/owners. The way we configure the
+### Adding a Datakinder vs an institutional user
 
 The flow to add a Datakinder user is different from adding a user to an institution:
+
 * adding a user to an institution has to happen prior to that user creating an account (by allowlisting their email for a given institution)
 * adding a Datakinder user has to happen after the Datakinder person has already created their account; their account's access type is then updated.
 
-In general, the service account used to run this service in GCP will also need to be granted Databricks access in the equivalent environment.
+### Uploading files
+
+The process to upload a file involves three API calls:
+1. Get the GCS upload URL: `GET /institutions/{inst_id}/upload-url/{file_name}`
+1. Post to the GCS upload URL: `POST <the_gcp_url_returned_from_step_1>`
+1. Validate the file: `POST /institutions/{inst_id}/input/validate-upload/{file_name}` OR `POST /institutions/{inst_id}/input/validate-sftp/{file_name}` -- depending on which input mechanism your file used. This sets a field in the File database table indicating the source of the file (`MANUAL_UPLOAD`, etc.), which is helpful information for the frontend.
 
-### Local VSCode Debugging
+## Local VSCode Debugging
 
 From the Run & Debug panel (⇧⌘D on 🍎) you can run the [debug launch config](../../.vscode/launch.json) for the webapp or worker modules. This will allow you to set breakpoints within the source code while the applications are running.
````
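The three-call upload flow in the new "Uploading files" section can be sketched as a helper that assembles the request targets in order. The base URL and function name are illustrative, not part of the API; step 2's target is intentionally left as the placeholder from the README, since it comes from step 1's response:

```python
# Hypothetical helper sketching the three-call upload flow; the endpoint paths
# come from the README, BASE and the function name are made up for illustration.
BASE = "http://127.0.0.1:8000/api/v1"


def upload_flow_requests(inst_id: str, file_name: str, via_sftp: bool = False):
    """Return the (method, target) pairs for the three upload calls, in order."""
    get_signed_url = ("GET", f"{BASE}/institutions/{inst_id}/upload-url/{file_name}")
    # Step 2 posts to whatever signed GCS URL step 1 returned; placeholder here.
    post_to_gcs = ("POST", "<the_gcp_url_returned_from_step_1>")
    validator = "validate-sftp" if via_sftp else "validate-upload"
    validate = ("POST", f"{BASE}/institutions/{inst_id}/input/{validator}/{file_name}")
    return [get_signed_url, post_to_gcs, validate]


for method, target in upload_flow_requests("inst123", "students.csv"):
    print(method, target)
```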

src/webapp/authn.py

Lines changed: 19 additions & 34 deletions
```diff
@@ -2,30 +2,20 @@
 Functions related to authentication.
 """
 
+from datetime import timedelta, datetime, timezone
 import jwt
-
 from fastapi import Security, HTTPException, status
 from fastapi.security import (
     OAuth2PasswordBearer,
-    OAuth2PasswordRequestForm,
     APIKeyHeader,
 )
 from passlib.context import CryptContext
 from pydantic import BaseModel
-from datetime import timedelta, datetime, timezone
 from .config import env_vars
 
 
 pwd_context = CryptContext(schemes=["bcrypt"], deprecated="auto")
 
-oauth2_scheme = OAuth2PasswordBearer(
-    scheme_name="user_scheme",
-    tokenUrl="token",
-    # We are using scope to sideload info on the end user. So "enduser" here is just a placeholder,
-    # but the actual username will be passed by the frontend.
-    scopes={"enduser": "end user to act as (a valid username), if frontend"},
-)
-
 oauth2_apikey_scheme = OAuth2PasswordBearer(
     scheme_name="api_key_scheme",
     tokenUrl="token-from-api-key",
@@ -43,59 +33,54 @@
 
 
 class Token(BaseModel):
+    """Info stored in the JWT."""
+
     access_token: str
     token_type: str
 
 
-class TokenData(BaseModel):
-    username: str | None = None
-
-
 def get_api_key(
-    api_key_header: str = Security(api_key_header),
-    api_key_inst_header: str = Security(api_key_inst_header),
-    api_key_enduser_header: str = Security(api_key_enduser_header),
-) -> str:
-    """Retrieve the api key and enduser header key if present.
-
-    Args:
-        api_key_header: The API key passed in the HTTP header.
-
-    Returns:
-        A tuple with the api key and enduser header if present. Authentication happens elsewhere.
-    Raises:
-        HTTPException: If the API key is invalid or missing.
-    """
-    if api_key_header:
-        return (api_key_header, api_key_inst_header, api_key_enduser_header)
+    api_key: str = Security(api_key_header),
+    api_key_inst: str = Security(api_key_inst_header),
+    api_key_enduser: str = Security(api_key_enduser_header),
+) -> tuple[str, str, str]:
+    """Retrieve the API key and enduser header key if present."""
+    if api_key:
+        return (api_key, api_key_inst, api_key_enduser)
     raise HTTPException(
         status_code=status.HTTP_401_UNAUTHORIZED,
         detail="Invalid or missing API Key",
     )
 
 
 def verify_password(plain_password: str, hashed_password: str) -> bool:
+    """Verify a plain password against a hash. Includes a 2y/2b replacement, since Laravel
+    generates hashes that start with 2y. The hashing scheme recognizes both."""
     revert_hash = hashed_password.replace("$2y", "$2b", 1)
     return pwd_context.verify(plain_password, revert_hash)
 
 
 def verify_api_key(plain_api_key: str, hashed_key: str) -> bool:
+    """Verify a plain API key against a hash."""
     return pwd_context.verify(plain_api_key, hashed_key)
 
 
 def get_api_key_hash(api_key: str):
+    """Hash a given API key."""
     return pwd_context.hash(api_key)
 
 
 def get_password_hash(password: str):
-    # to align with the password hashing used by Laravel, we have to replace the 2b
-    # generated by pwd_context with 2y and that should be the version we store.
-    # They should be functionally the same: https://stackoverflow.com/a/36225192/28478909
+    """Hash a password. To align with the password hashing used by Laravel, we have to replace
+    the 2b generated by pwd_context with 2y, and that should be the version we store.
+    They should be functionally the same: https://stackoverflow.com/a/36225192/28478909
+    """
     initial_hash = pwd_context.hash(password)
     return initial_hash.replace("$2b", "$2y", 1)
 
 
 def create_access_token(data: dict, expires_delta: timedelta | None = None):
+    """Create a JWT."""
     to_encode = data.copy()
     if expires_delta:
         expire = datetime.now(timezone.utc) + expires_delta
```
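The 2y/2b handling in `verify_password` and `get_password_hash` above boils down to a prefix swap on the bcrypt hash string: Laravel labels its bcrypt hashes `$2y$` while passlib emits `$2b$`, and the two prefixes denote the same algorithm. A stdlib-only sketch with a made-up hash value:

```python
# Laravel writes bcrypt hashes with the "$2y$" prefix; passlib's bcrypt handler
# uses "$2b$". Both denote the same algorithm, so the code swaps the prefix at
# the string level before verifying (2y -> 2b) or storing (2b -> 2y).
LARAVEL_HASH = "$2y$12$abcdefghijklmnopqrstuv"  # made-up value; shape only, not a real hash


def to_passlib_form(hashed: str) -> str:
    """The replacement verify_password applies before calling pwd_context.verify."""
    return hashed.replace("$2y", "$2b", 1)


def to_laravel_form(hashed: str) -> str:
    """The replacement get_password_hash applies before storing the hash."""
    return hashed.replace("$2b", "$2y", 1)


print(to_passlib_form(LARAVEL_HASH))  # $2b$12$abcdefghijklmnopqrstuv
print(to_laravel_form(to_passlib_form(LARAVEL_HASH)) == LARAVEL_HASH)  # True
```

The `count=1` argument to `str.replace` matters: only the identifier prefix should change, never bytes later in the hash.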

src/webapp/authn_test.py

Lines changed: 0 additions & 4 deletions
```diff
@@ -1,9 +1,5 @@
 """Test file for authn.py."""
 
-import pytest
-
-from fastapi import HTTPException
-import uuid
 from .authn import (
     get_password_hash,
     verify_password,
```

src/webapp/config.py

Lines changed: 5 additions & 4 deletions
```diff
@@ -11,6 +11,8 @@
     "ACCESS_TOKEN_EXPIRE_MINUTES": "120",
     # The Issuers env var will be stored as an array of emails.
     "API_KEY_ISSUERS": [],
+    "INITIAL_API_KEY": "",
+    "INITIAL_API_KEY_ID": "",
 }
 
 # The INSTANCE_HOST is the private IP of the Cloud SQL instance e.g. '127.0.0.1' ('172.17.0.1' if deployed to GAE Flex)
@@ -50,8 +52,8 @@
 }
 
 
-# Setup function to get environment variables. Should be called at startup time.
 def startup_env_vars():
+    """Setup function to get environment variables. Should be called at startup time."""
     env_file = os.environ.get("ENV_FILE_PATH")
     if not env_file:
         raise ValueError(
@@ -81,8 +83,7 @@ def startup_env_vars():
             "ENV environment variable not one of: PROD, STAGING, DEV, LOCAL."
         )
         if (
-            name == "ACCESS_TOKEN_EXPIRE_MINUTES"
-            or name == "ACCESS_TOKEN_EXPIRE_MINUTES"
+            name == "ACCESS_TOKEN_EXPIRE_MINUTES"
         ) and not env_var.isdigit():
             raise ValueError(
                 "ACCESS_TOKEN_EXPIRE_MINUTES and ACCESS_TOKEN_EXPIRE_MINUTES environment variables must be an int."
@@ -111,8 +112,8 @@ def startup_env_vars():
         databricks_vars[name] = env_var
 
 
-# Setup function to get db environment variables. Should be called at db startup time.
 def setup_database_vars():
+    """Setup function to get db environment variables. Should be called at db startup time."""
     global engine_vars
     for name in engine_vars:
         env_var = os.environ.get(name)
```
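The `ACCESS_TOKEN_EXPIRE_MINUTES` validation in `startup_env_vars` can be sketched in isolation (simplified: the real function reads from `os.environ` and validates several other variables too; the function name here is made up):

```python
def check_expiry_minutes(value: str) -> int:
    """Mirror of the ACCESS_TOKEN_EXPIRE_MINUTES check: the env var must parse as an int."""
    # str.isdigit() rejects signs, decimals, and empty strings, so only
    # non-negative whole-minute values pass, matching the config's intent.
    if not value.isdigit():
        raise ValueError("ACCESS_TOKEN_EXPIRE_MINUTES environment variable must be an int.")
    return int(value)


print(check_expiry_minutes("120"))  # 120
```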
