Skip to content

Commit e208728

Browse files
Outage blog post (#1058)
* feat: Outage post + blog additions (author, etc) * name change * line about studio mode * feat: layout changes/formatting * feat: Author types + linting
1 parent e552530 commit e208728

File tree

9 files changed

+195
-6
lines changed

9 files changed

+195
-6
lines changed

apps/web/app/(site)/blog/[slug]/page.tsx

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import type { Metadata } from "next";
44
import Image from "next/image";
55
import { notFound } from "next/navigation";
66
import { MDXRemote } from "next-mdx-remote/rsc";
7+
import { AuthorByline } from "@/components/blog/AuthorByline";
78
import { BlogTemplate } from "@/components/blog/BlogTemplate";
89
import { ReadyToGetStarted } from "@/components/ReadyToGetStarted";
910
import { getBlogPosts } from "@/utils/blog";
@@ -81,11 +82,11 @@ export default async function PostPage({ params }: PostProps) {
8182

8283
return (
8384
<>
84-
<article className="px-5 py-32 mx-auto md:py-40 prose">
85+
<article className="px-5 py-24 mx-auto md:py-40 prose">
8586
{post.metadata.image && (
86-
<div className="relative mb-12 h-[345px] w-full">
87+
<div className="relative mb-6 h-[200px] sm:h-[280px] md:h-[345px] w-full rounded-lg overflow-hidden">
8788
<Image
88-
className="object-contain m-0 w-full rounded-lg sm:object-cover"
89+
className="object-contain m-0 w-full sm:object-cover"
8990
src={post.metadata.image}
9091
alt={post.metadata.title}
9192
fill
@@ -114,6 +115,9 @@ export default async function PostPage({ params }: PostProps) {
114115
</header>
115116
<hr className="my-6" />
116117
<MDXRemote source={post.content} />
118+
{"author" in post.metadata && post.metadata.author && (
119+
<AuthorByline authors={post.metadata.author} />
120+
)}
117121
<Share
118122
post={post}
119123
url={`${buildEnv.NEXT_PUBLIC_WEB_URL}/blog/${post.slug}`}

apps/web/app/Layout/Intercom/Client.tsx

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,10 @@ export function Client(props: { hash?: string }) {
99
const user = use(useAuthContext().user);
1010
const pathname = usePathname();
1111
const isSharePage = pathname?.startsWith("/s/");
12+
const isBlogPage = pathname?.startsWith("/blog");
1213

1314
useEffect(() => {
14-
if (!isSharePage) {
15+
if (!isSharePage && !isBlogPage) {
1516
if (props.hash && user) {
1617
Intercom({
1718
app_id: "efxq71cv",
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
import Image from "next/image";
2+
import Link from "next/link";
3+
import { parseAuthors } from "@/utils/authors";
4+
5+
interface AuthorBylineProps {
6+
authors: string;
7+
}
8+
9+
export function AuthorByline({ authors }: AuthorBylineProps) {
10+
const authorList = parseAuthors(authors);
11+
12+
if (authorList.length === 0) {
13+
return null;
14+
}
15+
16+
return (
17+
<div className="mt-16 pt-8 border-t border-gray-200">
18+
<div className="flex flex-wrap gap-1 sm:gap-6">
19+
{authorList.map((author, index) => (
20+
<div key={author.name} className="flex items-center space-x-3">
21+
<Image
22+
src={author.image}
23+
alt={author.name}
24+
width={48}
25+
height={48}
26+
className="w-10 h-10 rounded-full object-cover"
27+
/>
28+
<div>
29+
<div className="font-medium text-gray-900">{author.name}</div>
30+
<Link
31+
href={`https://x.com/${author.handle}`}
32+
target="_blank"
33+
rel="noopener noreferrer"
34+
className="text-sm text-blue-600 hover:text-blue-700 transition-colors"
35+
>
36+
@{author.handle}
37+
</Link>
38+
</div>
39+
</div>
40+
))}
41+
</div>
42+
</div>
43+
);
44+
}

apps/web/components/pages/UpdatesPage.tsx

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,10 @@ import {
88
} from "@/utils/blog-registry";
99
import { generateGradientFromSlug } from "@/utils/gradients";
1010

11-
const FEATURED_SLUGS = ["handling-a-stripe-payment-attack", "cap-v03-launch"];
11+
const FEATURED_SLUGS = [
12+
"handling-a-stripe-payment-attack",
13+
"september-23-outage-deep-dive",
14+
];
1215

1316
export const UpdatesPage = () => {
1417
const allUpdates = getBlogPosts() as BlogPost[];
@@ -38,7 +41,7 @@ export const UpdatesPage = () => {
3841
});
3942

4043
return (
41-
<div className="py-32 md:py-40 wrapper wrapper-sm">
44+
<div className="pt-24 pb-32 md:py-40 wrapper wrapper-sm">
4245
{featuredPosts.length > 0 && (
4346
<div className="mb-6">
4447
<div className="grid grid-cols-1 gap-6 md:grid-cols-2">
Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
---
2+
title: A deep dive into our September 23rd outage and what we changed
3+
description: A detailed account of how an IaC change removed our Vercel project, how we restored service, and the concrete fixes we have made to prevent a repeat.
4+
publishedAt: "2025-09-23"
5+
category: Technical
6+
image: /blog/deep-dive.jpg
7+
author: Richie McIlroy, Brendan Allan
8+
tags: Security, Outage, Deep Dive
9+
---
10+
11+
On 23rd September we had our first major outage. We sincerely apologize for the service disruption. This post explains what happened, how we fixed it, and what we are changing. We want this to be useful for users and for other teams running on Vercel and SST.
12+
13+
## Summary
14+
15+
- A local Infrastructure as Code change in SST referenced our Vercel project as a new resource instead of retrieving the existing one.
16+
- During `sst deploy` the Vercel project that serves cap.so was removed, which took down marketing pages, the dashboard, video sharing pages, and APIs used by Cap Desktop.
17+
- We rebuilt the project, restored environment variables, and reattached custom domains. Service was mostly back after 90 minutes.
18+
- Uploads from desktop stayed broken for another 30 minutes due to a www redirect that stripped `Authorization` headers. Removing that behavior brought us back to full health.
19+
20+
## Impact
21+
22+
- **Duration:** 2:30 pm to 4:30 pm AWST for full restoration, with partial restoration at 4:00 pm.
23+
- **Surface area:** cap.so marketing site, dashboard, video sharing, and desktop upload APIs.
24+
- **Customer effect:** broken links and failed API calls from Cap Desktop during the incident window.
25+
26+
## Timeline
27+
28+
Times in AWST.
29+
30+
- **2:30 pm**: While configuring IaC in SST, we changed our Vercel linkage from declaration to what we thought was a retrieval pattern.
31+
- **Shortly after**: `sst deploy` removed the Vercel project that hosts cap.so, taking down web and API surfaces.
32+
- **~2:40 pm**: We recognized the removal and created a new Vercel project, then began restoring environment variables and domains.
33+
- **~3:10 pm to 3:50 pm**: Restored required env vars from Bitwarden and SST, reconnected custom domains, redeployed.
34+
- **4:00 pm**: Most of the app was up again, uploads from desktop still failing.
35+
- **4:30 pm**: Found a redirect from `cap.so` to `www.cap.so` that swallowed `Authorization` headers. Fixed the routing and header handling. 100 percent of cap.so back online.
36+
37+
## Technical details
38+
39+
### The IaC change
40+
41+
The goal for the day was to stop touching production by hand and to add a staging environment. While moving toward that, our SST code switched from a project declaration to what looked like a reference to an existing Vercel project.
42+
43+
```ts
44+
/// in config()
45+
{
46+
removal: "retain";
47+
}
48+
49+
/// in app()
50+
// what we had
51+
new vercel.Project("VercelProject", { name: "cap-web" });
52+
53+
// what we expected to use across stages
54+
vercel.getProjectOutput({ name: "cap-web" });
55+
```
56+
57+
Two things mattered:
58+
59+
1. **Resource mode**: The first form is a declaration. If the stack believes it owns the lifecycle, removal events can propagate.
60+
2. **Removal behavior**: We relied on `removal: "retain"`, which does not fully protect nested resources. In hindsight we should have used `removal: "retain-all"` for anything that touches production resources.
61+
62+
During `sst deploy` the Vercel project was deleted. That removed the hosting for marketing, dashboard, video sharing, and the API.
63+
64+
### Rebuild and environment restore
65+
66+
We immediately created a new Vercel project. Critical environment variables were stored in Bitwarden and in SST. We restored those, then reattached custom domains and redeployed.
67+
68+
### Why uploads kept failing
69+
70+
After most pages returned, uploads from Cap Desktop still failed. We had configured cap.so to redirect to www.cap.so as recommended by Vercel, but this had the side effect of stripping the Authorization header from any request sent to cap.so. The client sent the header, the redirect hop discarded it, and the target saw an unauthenticated request. We removed this redirect and video uploads began working again.
71+
72+
## What went wrong
73+
74+
- A local IaC run had the ability to mutate a production resource.
75+
- We treated `removal: "retain"` as a safety net. It was not sufficient for this stack.
76+
- A domain-level redirect applied to API paths, which caused header loss for authenticated requests.
77+
78+
## What went well
79+
80+
- We had env vars backed up in two places. This reduced guesswork during restore.
81+
- Custom domains were reattached quickly.
82+
- We used a clear checklist during restore and avoided risky parallel changes.
83+
- Cap's local Studio Mode remained fully functional. Users could still record videos locally, export them, or save them for later to create shareable links once service was restored.
84+
85+
## Changes we have made
86+
87+
1. **Air‑gap production from local changes**
88+
89+
- Use separate AWS credentials and roles for staging vs production.
90+
- Require an approval step for any plan that touches production resources.
91+
92+
2. **Treat shared external services as data, not resources**
93+
94+
- Never declare long‑lived shared services such as Vercel projects or PlanetScale databases in a way that allows lifecycle control from an app stack.
95+
- Always use retrieval patterns for cross‑stage references. In SST and our wrappers this means `get*` forms only.
96+
97+
3. **Hard guardrails on removal**
98+
99+
- Set `removal: "retain-all"` for anything that can reference production assets.
100+
- Add policy checks that fail a plan if a remove or replace action targets the production Vercel project or DNS.
101+
102+
4. **Incident communications**
103+
- Even if we expect a short interruption, we will notify users promptly inside the app and on status channels. Short incidents can stretch when there are hidden dependencies.
104+
105+
## Closing
106+
107+
The outage was caused by a change that should never have been able to affect production. The fix is not only better configuration. We have changed how we reference shared services, how we protect production from local changes, and how we route authenticated traffic. Thank you for your patience while we worked through this. If you were impacted and need help, contact us and we will make it right.
18 KB
Loading
26.1 KB
Loading

apps/web/public/blog/deep-dive.jpg

102 KB
Loading

apps/web/utils/authors.ts

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
/** Profile details shown for a blog post author. */
export interface Author {
  /** Display name; matches the key the author is registered under in `AUTHORS`. */
  name: string;
  /** Social handle, stored without a leading "@". */
  handle: string;
  /** Path of the author's avatar image. */
  image: string;
}

/** Registry of known blog authors, keyed by display name. */
export const AUTHORS: Record<string, Author> = {
  "Richie McIlroy": {
    name: "Richie McIlroy",
    handle: "richiemcilroy",
    image: "/blog/author/richiemcilroy.jpg",
  },
  "Brendan Allan": {
    name: "Brendan Allan",
    handle: "brendonovichdev",
    image: "/blog/author/brendonovichdev.jpg",
  },
};

/**
 * Looks up a registered author by exact display name.
 *
 * @param name - Display name to look up (case-sensitive, no trimming applied).
 * @returns The matching author, or `undefined` when the name is not registered.
 */
export function getAuthor(name: string): Author | undefined {
  // AUTHORS is a plain object, so a bare `AUTHORS[name]` would also resolve
  // inherited keys such as "constructor", "toString" or "__proto__" to
  // Object.prototype members. Only own entries are real authors.
  return Object.prototype.hasOwnProperty.call(AUTHORS, name)
    ? AUTHORS[name]
    : undefined;
}

/**
 * Resolves a comma-separated list of author names to registered authors.
 *
 * Each name is trimmed before lookup. Names that are not registered (including
 * empty segments) are dropped. Repeated names yield a single entry, so callers
 * can safely use `name` as a unique key. Order follows first appearance.
 *
 * @param authorString - Comma-separated display names, e.g. `"A, B"`.
 * @returns The recognised authors, without duplicates.
 */
export function parseAuthors(authorString: string): Author[] {
  const seenNames = new Set<string>();
  const authors: Author[] = [];

  for (const rawName of authorString.split(",")) {
    const author = getAuthor(rawName.trim());
    if (author === undefined || seenNames.has(author.name)) {
      continue;
    }
    seenNames.add(author.name);
    authors.push(author);
  }

  return authors;
}

0 commit comments

Comments
 (0)