Skip to content

Commit bb60d6b

Browse files
authored
manage terraform scripts for Azure staging environment (#17)
* Create DEPLOYMENT.md * Update DEPLOYMENT.md * copy over scripts from infra-modules into deploy/terraform * update instructions * remove copied readme
1 parent 7d1d773 commit bb60d6b

File tree

8 files changed

+205
-5
lines changed

8 files changed

+205
-5
lines changed

DEPLOYMENT.md

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
# TLM Deployment Guide
2+
3+
## Requirements
4+
5+
- Terraform CLI
6+
```
7+
brew install terraform
8+
```
9+
- Azure CLI
10+
```
11+
brew install azure-cli
12+
```
13+
- Helm
14+
```
15+
brew install helm
16+
```
17+
- Azure account in Cleanlab's tenant
18+
- ask Kelsey if you need to be invited
19+
- Minimum required Azure role assignments:
20+
- `Reader` on the OpenAI service resource group `tlm-staging-rg`
21+
- `Storage Blob Data Reader` on the `tlmtfstate` storage account
22+
23+
Below are optional Azure roles to request on the `Production` subscription. Ask Kelsey if you want permissions.
24+
* `Reader` for global read access
25+
* `Azure Kubernetes Service Contributor` for access to live/historical container logs, `kubectl` usage, and more
26+
27+
## Initial Setup
28+
29+
Import the existing resources into the Terraform state (run these from `deploy/terraform/app`, after `terraform init`):
30+
```
31+
terraform import -var-file="staging.tfvars" 'module.app.azurerm_role_assignment.openai_identity_sa' "/subscriptions/a47bf188-5236-4db5-bde5-16655f9d07ec/resourceGroups/tlm-staging-rg/providers/Microsoft.CognitiveServices/accounts/tlm-openai/providers/Microsoft.Authorization/roleAssignments/5c088622-3f50-84c7-e52e-09d253ed0325"
32+
33+
terraform import -var-file="staging.tfvars" 'module.app.helm_release.this' tlm/tlm
34+
```
35+
36+
## Deploying to Azure staging environment
37+
38+
Follow these instructions to deploy TLM app changes through the CLI. Note that after merging to main, you must wait for the `Release TLM App / build-push-chat-backend-acr` step to finish before starting the deployment process.
39+
40+
1. Run `az login` and select the `Production` subscription
41+
2. Change your working directory: `cd deploy/terraform/app`
42+
3. Set up your Terraform variables:
43+
44+
* If this is your first time deploying, `cp staging.tfvars.example staging.tfvars`
45+
* Modify the `.tfvars` file by setting `app_image_tag` to the Git commit SHA that you want to deploy
46+
47+
4. `terraform init`
48+
5. `terraform plan -var-file="staging.tfvars" -out=tfplan.plan`
49+
50+
The plan should be `0 to add, 1 to change, 0 to destroy`. The only change should be to the Helm release resource `module.app.helm_release.this`, updating `chat_backend.image.tag` plus some metadata. If anything else is included in the plan, check that your branch has the latest version of the Terraform script, or reach out in [#azure](https://cleanlabinc.slack.com/archives/C093X788A6L).
51+
52+
6. If the plan looks good, apply the changes by running `terraform apply tfplan.plan`.
53+
54+
Note that this will **NOT** prompt you to confirm the changes because they were already saved by the planning step, so be careful!
55+
56+
## Debugging
57+
58+
If you encounter permission issues (4xx status codes) when running any of the `terraform` commands, try unsetting the `ARM_*` environment variables. These can interfere with Azure-related authentication when managing resources.
59+
60+
```
61+
unset ARM_CLIENT_ID ARM_CLIENT_SECRET ARM_TENANT_ID ARM_SUBSCRIPTION_ID
62+
```

deploy/terraform/app/main.tf

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ locals {
1111
environment = "staging"
1212
resource_group_name = "tlm-staging-rg"
1313
subscription_id = "a47bf188-5236-4db5-bde5-16655f9d07ec"
14-
location = "eastus"
14+
location = "eastus2"
1515
tags = {
1616
environment = local.environment
1717
project = "tlm"
@@ -32,8 +32,6 @@ resource "azurerm_resource_group" "this" {
3232

3333
provider "azurerm" {
3434
features {}
35-
use_cli = false
36-
use_msi = false
3735

3836
subscription_id = local.subscription_id
3937
}
@@ -70,8 +68,11 @@ module "app" {
7068
image_pull_username = data.terraform_remote_state.infra.outputs.acr_image_pull_app_client_id
7169
image_pull_password = data.terraform_remote_state.infra.outputs.acr_image_pull_app_password
7270

73-
default_completion_model = "azure/gpt-4o-mini"
74-
default_embedding_model = "azure/text-embedding-3-small"
71+
default_completion_model = "azure/gpt-4.1-mini"
72+
lowest_latency_model = "azure/gpt-4.1-nano"
73+
74+
enable_external_access = true
75+
model_config_file_path = "models.json"
7576

7677
tags = local.tags
7778
}

deploy/terraform/app/models.json

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
{
2+
"gpt-4o": {
3+
"api_base": "https://azure-cognitive-fegxs.openai.azure.com/",
4+
"api_version": "2024-11-20"
5+
},
6+
"gpt-4o-mini": {
7+
"api_base": "https://azure-cognitive-fegxs.openai.azure.com/",
8+
"api_version": "2024-07-18"
9+
},
10+
"gpt-4.1-mini": {
11+
"api_base": "https://azure-cognitive-fegxs.openai.azure.com/",
12+
"api_version": "2025-04-14"
13+
},
14+
"gpt-4.1-nano": {
15+
"api_base": "https://azure-cognitive-fegxs.openai.azure.com/",
16+
"api_version": "2025-04-14"
17+
}
18+
}
deploy/terraform/app/staging.tfvars.example

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
environment = "staging"
2+
app_image_tag = ""

deploy/terraform/infra/backend.tf

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
terraform {
2+
backend "azurerm" {
3+
resource_group_name = "tfstate-rg"
4+
storage_account_name = "tlmtfstate"
5+
container_name = "tfstate"
6+
key = "tlm/infra/terraform.tfstate"
7+
}
8+
}

deploy/terraform/infra/main.tf

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
provider "azurerm" {
2+
features {}
3+
subscription_id = "a47bf188-5236-4db5-bde5-16655f9d07ec"
4+
}
5+
6+
locals {
7+
environment = "staging"
8+
}
9+
10+
module "infra" {
11+
source = "git::https://github.com/cleanlab/infra-modules.git//tlm/infra"
12+
# source = "../../../tlm/infra"
13+
14+
environment = local.environment
15+
entity = "your-company-name"
16+
location = "eastus2"
17+
18+
create_openai_service = true
19+
openai_deployments = {
20+
"gpt-4o-mini" = {
21+
name = "gpt-4o-mini"
22+
model = "gpt-4o-mini"
23+
version = "2024-07-18"
24+
format = "OpenAI"
25+
scale = "GlobalStandard"
26+
capacity = 200
27+
}
28+
"gpt-4o" = {
29+
name = "gpt-4o"
30+
model = "gpt-4o"
31+
version = "2024-11-20"
32+
format = "OpenAI"
33+
scale = "GlobalStandard"
34+
capacity = 50
35+
}
36+
"gpt-4.1-mini" = {
37+
name = "gpt-4.1-mini"
38+
model = "gpt-4.1-mini"
39+
version = "2025-04-14"
40+
format = "OpenAI"
41+
scale = "GlobalStandard"
42+
capacity = 200
43+
}
44+
"gpt-4.1-nano" = {
45+
name = "gpt-4.1-nano"
46+
model = "gpt-4.1-nano"
47+
version = "2025-04-14"
48+
format = "OpenAI"
49+
scale = "GlobalStandard"
50+
capacity = 200
51+
}
52+
"text-embedding-3-small" = {
53+
name = "text-embedding-3-small"
54+
model = "text-embedding-3-small"
55+
version = "1"
56+
format = "OpenAI"
57+
scale = "Standard"
58+
capacity = 50
59+
}
60+
}
61+
62+
create_imagepull_app_registration = true
63+
64+
tags = {
65+
environment = local.environment
66+
project = "tlm"
67+
terraform = "true"
68+
}
69+
}

deploy/terraform/infra/outputs.tf

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
output "kube_host" {
2+
value = module.infra.kube_host
3+
sensitive = true
4+
}
5+
6+
output "kube_client_certificate" {
7+
value = module.infra.kube_client_certificate
8+
sensitive = true
9+
}
10+
11+
output "kube_client_key" {
12+
value = module.infra.kube_client_key
13+
sensitive = true
14+
}
15+
16+
output "kube_cluster_ca_certificate" {
17+
value = module.infra.kube_cluster_ca_certificate
18+
sensitive = true
19+
}
20+
21+
output "cluster_oidc_issuer_url" {
22+
value = module.infra.cluster_oidc_issuer_url
23+
}
24+
25+
output "acr_image_pull_app_client_id" {
26+
value = module.infra.acr_image_pull_app_client_id
27+
}
28+
29+
output "acr_image_pull_app_password" {
30+
value = module.infra.acr_image_pull_app_password
31+
sensitive = true
32+
}
33+
34+
output "openai_service_name" {
35+
value = module.infra.openai_service_name
36+
}
37+
38+
output "openai_service_resource_group_name" {
39+
value = module.infra.openai_service_resource_group_name
40+
}

deploy/terraform/infra/variables.tf

Whitespace-only changes.

0 commit comments

Comments
 (0)