Skip to content

Commit 2cb77fc

Browse files
authored
Merge pull request #7816 from systeminit/jhelwig/s3-error-handling
feat(layer-cache): add structured S3 error handling and observability
2 parents 4757dac + eacf027 commit 2cb77fc

File tree

6 files changed

+709
-103
lines changed

6 files changed

+709
-103
lines changed

dev/config/grafana/provisioning/dashboards/layer-cache-metrics.json

Lines changed: 172 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -691,6 +691,15 @@
691691
"expr": "rate(layer_cache_persister_write_failed_permanent{exported_job=~\"$service\", cache_name=~\"$cache\"}[1m])",
692692
"legendFormat": "{{exported_job}}/{{cache_name}} - permanent",
693693
"refId": "B"
694+
},
695+
{
696+
"datasource": {
697+
"type": "prometheus",
698+
"uid": "${datasource}"
699+
},
700+
"expr": "rate(layer_cache_persister_write_failed_retryable{exported_job=~\"$service\", cache_name=~\"$cache\", backend=\"s3\", error_kind!=\"\"}[1m])",
701+
"legendFormat": "{{exported_job}}/{{cache_name}}/S3 - {{error_kind}}",
702+
"refId": "C"
694703
}
695704
],
696705
"fieldConfig": {
@@ -723,10 +732,159 @@
723732
}
724733
},
725734
{
726-
"id": 21,
735+
"id": 12,
727736
"gridPos": {
728737
"x": 0,
729738
"y": 48,
739+
"w": 24,
740+
"h": 8
741+
},
742+
"type": "timeseries",
743+
"title": "S3 Write Failures by Error Category",
744+
"datasource": {
745+
"type": "prometheus",
746+
"uid": "${datasource}"
747+
},
748+
"targets": [
749+
{
750+
"datasource": {
751+
"type": "prometheus",
752+
"uid": "${datasource}"
753+
},
754+
"expr": "rate(layer_cache_persister_write_failed_retryable{exported_job=~\"$service\", cache_name=~\"$cache\", backend=\"s3\", error_kind!=\"\"}[1m])",
755+
"legendFormat": "{{exported_job}}/{{cache_name}} - {{error_kind}}",
756+
"refId": "A"
757+
}
758+
],
759+
"fieldConfig": {
760+
"defaults": {
761+
"unit": "ops",
762+
"color": {
763+
"mode": "palette-classic"
764+
},
765+
"custom": {
766+
"lineWidth": 2,
767+
"fillOpacity": 10,
768+
"showPoints": "never",
769+
"stacking": {
770+
"mode": "normal"
771+
}
772+
}
773+
},
774+
"overrides": [
775+
{
776+
"matcher": {
777+
"id": "byRegexp",
778+
"options": "/.*authentication.*/"
779+
},
780+
"properties": [
781+
{
782+
"id": "color",
783+
"value": {
784+
"mode": "fixed",
785+
"fixedColor": "red"
786+
}
787+
}
788+
]
789+
},
790+
{
791+
"matcher": {
792+
"id": "byRegexp",
793+
"options": "/.*throttling.*/"
794+
},
795+
"properties": [
796+
{
797+
"id": "color",
798+
"value": {
799+
"mode": "fixed",
800+
"fixedColor": "orange"
801+
}
802+
}
803+
]
804+
},
805+
{
806+
"matcher": {
807+
"id": "byRegexp",
808+
"options": "/.*network.*/"
809+
},
810+
"properties": [
811+
{
812+
"id": "color",
813+
"value": {
814+
"mode": "fixed",
815+
"fixedColor": "yellow"
816+
}
817+
}
818+
]
819+
},
820+
{
821+
"matcher": {
822+
"id": "byRegexp",
823+
"options": "/.*not_found.*/"
824+
},
825+
"properties": [
826+
{
827+
"id": "color",
828+
"value": {
829+
"mode": "fixed",
830+
"fixedColor": "blue"
831+
}
832+
}
833+
]
834+
},
835+
{
836+
"matcher": {
837+
"id": "byRegexp",
838+
"options": "/.*configuration.*/"
839+
},
840+
"properties": [
841+
{
842+
"id": "color",
843+
"value": {
844+
"mode": "fixed",
845+
"fixedColor": "purple"
846+
}
847+
}
848+
]
849+
},
850+
{
851+
"matcher": {
852+
"id": "byRegexp",
853+
"options": "/.*other.*/"
854+
},
855+
"properties": [
856+
{
857+
"id": "color",
858+
"value": {
859+
"mode": "fixed",
860+
"fixedColor": "gray"
861+
}
862+
}
863+
]
864+
}
865+
]
866+
},
867+
"options": {
868+
"tooltip": {
869+
"mode": "multi",
870+
"sort": "desc"
871+
},
872+
"legend": {
873+
"displayMode": "table",
874+
"placement": "bottom",
875+
"calcs": [
876+
"lastNotNull",
877+
"max",
878+
"mean"
879+
]
880+
}
881+
}
882+
},
883+
{
884+
"id": 21,
885+
"gridPos": {
886+
"x": 0,
887+
"y": 56,
730888
"w": 12,
731889
"h": 8
732890
},
@@ -781,7 +939,7 @@
781939
"id": 23,
782940
"gridPos": {
783941
"x": 12,
784-
"y": 48,
942+
"y": 56,
785943
"w": 12,
786944
"h": 8
787945
},
@@ -836,7 +994,7 @@
836994
"id": 14,
837995
"gridPos": {
838996
"x": 0,
839-
"y": 56,
997+
"y": 64,
840998
"w": 12,
841999
"h": 8
8421000
},
@@ -908,7 +1066,7 @@
9081066
"id": 15,
9091067
"gridPos": {
9101068
"x": 12,
911-
"y": 56,
1069+
"y": 64,
9121070
"w": 12,
9131071
"h": 8
9141072
},
@@ -980,7 +1138,7 @@
9801138
"id": 16,
9811139
"gridPos": {
9821140
"x": 0,
983-
"y": 64,
1141+
"y": 72,
9841142
"w": 24,
9851143
"h": 8
9861144
},
@@ -1034,7 +1192,7 @@
10341192
"id": 17,
10351193
"gridPos": {
10361194
"x": 0,
1037-
"y": 72,
1195+
"y": 80,
10381196
"w": 24,
10391197
"h": 8
10401198
},
@@ -1088,7 +1246,7 @@
10881246
"id": 25,
10891247
"gridPos": {
10901248
"x": 0,
1091-
"y": 80,
1249+
"y": 88,
10921250
"w": 24,
10931251
"h": 8
10941252
},
@@ -1143,7 +1301,7 @@
11431301
"id": 26,
11441302
"gridPos": {
11451303
"x": 0,
1146-
"y": 88,
1304+
"y": 96,
11471305
"w": 24,
11481306
"h": 8
11491307
},
@@ -1216,7 +1374,7 @@
12161374
"id": 19,
12171375
"gridPos": {
12181376
"x": 0,
1219-
"y": 96,
1377+
"y": 104,
12201378
"w": 12,
12211379
"h": 8
12221380
},
@@ -1279,7 +1437,7 @@
12791437
"id": 20,
12801438
"gridPos": {
12811439
"x": 12,
1282-
"y": 96,
1440+
"y": 104,
12831441
"w": 12,
12841442
"h": 8
12851443
},
@@ -1342,7 +1500,7 @@
13421500
"id": null,
13431501
"gridPos": {
13441502
"x": 0,
1345-
"y": 104,
1503+
"y": 112,
13461504
"w": 24,
13471505
"h": 1
13481506
},
@@ -1354,7 +1512,7 @@
13541512
"id": null,
13551513
"gridPos": {
13561514
"x": 0,
1357-
"y": 105,
1515+
"y": 113,
13581516
"w": 8,
13591517
"h": 8
13601518
},
@@ -1393,7 +1551,7 @@
13931551
"id": null,
13941552
"gridPos": {
13951553
"x": 0,
1396-
"y": 113,
1554+
"y": 121,
13971555
"w": 24,
13981556
"h": 8
13991557
},
@@ -1462,7 +1620,7 @@
14621620
"id": null,
14631621
"gridPos": {
14641622
"x": 0,
1465-
"y": 121,
1623+
"y": 129,
14661624
"w": 8,
14671625
"h": 8
14681626
},

lib/dal-test/src/lib.rs

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -349,14 +349,14 @@ impl Config {
349349
} = &config.object_storage_config.auth
350350
{
351351
let new_access_key =
352-
env::var(ENV_VAR_S3_ACCESS_KEY).unwrap_or_else(|_| access_key.clone());
352+
env::var(ENV_VAR_S3_ACCESS_KEY).unwrap_or_else(|_| access_key.to_string());
353353
let new_secret_key =
354-
env::var(ENV_VAR_S3_SECRET_KEY).unwrap_or_else(|_| secret_key.clone());
354+
env::var(ENV_VAR_S3_SECRET_KEY).unwrap_or_else(|_| secret_key.to_string());
355355

356356
config.object_storage_config.auth =
357357
si_layer_cache::s3::S3AuthConfig::StaticCredentials {
358-
access_key: new_access_key,
359-
secret_key: new_secret_key,
358+
access_key: new_access_key.into(),
359+
secret_key: new_secret_key.into(),
360360
};
361361
}
362362

0 commit comments

Comments
 (0)