replicate · paulnegz · Jul 29, 2025 · Jul 29, 2025
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -93,22 +93,32 @@ Common contribution types include: `doc`, `code`, `bug`, and `ideas`. See the fu
 
 ## Development environment
 
-We use the ["scripts to rule them all"](https://github.blog/engineering/engineering-principles/scripts-to-rule-them-all/) philosophy to manage common tasks across the project. These are mostly backed by a Makefile that contains the implementation.
-
 You'll need the following dependencies installed to build Cog locally:
-- [Go](https://golang.org/doc/install): We're targeting 1.24, but you can install the latest version since Go is backwards compatible. If you're using a newer Mac with an M1 chip, be sure to download the `darwin-arm64` installer package. Alternatively you can run `brew install go` which will automatically detect and use the appropriate installer for your system architecture.
-- [uv](https://docs.astral.sh/uv/): Python versions and dependencies are managed by uv.
+
+- [Go](https://golang.org/doc/install): We're targeting 1.23, but you can install the latest version since Go is backwards compatible. If you're using a newer Mac with an M1 chip, be sure to download the `darwin-arm64` installer package. Alternatively you can run `brew install go` which will automatically detect and use the appropriate installer for your system architecture.
+- [uv](https://docs.astral.sh/uv/): Python versions and dependencies are managed by uv, both in development and container environments.
 - [Docker](https://docs.docker.com/desktop) or [OrbStack](https://orbstack.dev)
 
 Install the Python dependencies:
 
     script/setup
 
-Once you have Go installed you can install the cog binary by running:
+Once you have Go installed, run:
+
+    make install
+
+This will build and install the `cog` binary to `/usr/local/bin/cog`. You can then use it to build and run models.
+
+## Package Management
+
+Cog uses [uv](https://docs.astral.sh/uv/) for Python package management, both in development and container environments. This provides:
 
-    make install PREFIX=$(go env GOPATH)
+- Fast, reliable package installation
+- Consistent dependency resolution
+- Efficient caching
+- Reproducible builds
 
-This installs the `cog` binary to `$GOPATH/bin/cog`.
+When building containers, uv is automatically installed and used to install Python packages from requirements.txt files. The cache is mounted at `/srv/r8/uv/cache` to speed up subsequent builds.
 
 To run ALL the tests:
 

diff --git a/pkg/dockerfile/standard_generator.go b/pkg/dockerfile/standard_generator.go
@@ -92,7 +92,7 @@ func NewStandardGenerator(config *config.Config, dir string, command command.Com
 		Config:           config,
 		Dir:              dir,
 		GOOS:             runtime.GOOS,
-		GOARCH:           runtime.GOOS,
+		GOARCH:           runtime.GOARCH,
 		tmpDir:           tmpDir,
 		relativeTmpDir:   relativeTmpDir,
 		fileWalker:       filepath.Walk,
@@ -414,18 +414,17 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked apt-get update -qq &
 	git \
 	ca-certificates \
 	&& rm -rf /var/lib/apt/lists/*
-` + fmt.Sprintf(`
+
+ENV UV_CACHE_DIR="/srv/r8/uv/cache"
 RUN --mount=type=cache,target=/root/.cache/pip curl -s -S -L https://raw.githubusercontent.com/pyenv/pyenv-installer/master/bin/pyenv-installer | bash && \
 	git clone https://github.com/momo-lab/pyenv-install-latest.git "$(pyenv root)"/plugins/pyenv-install-latest && \
 	export PYTHON_CONFIGURE_OPTS='--enable-optimizations --with-lto' && \
 	export PYTHON_CFLAGS='-O3' && \
-	pyenv install-latest "%s" && \
-	pyenv global $(pyenv install-latest --print "%s") && \
-	pip install "wheel<1"`, py, py) + `
+	pyenv install-latest "` + py + `" && \
+	pyenv global $(pyenv install-latest --print "` + py + `") && \
+	curl -LsSf https://astral.sh/uv/install.sh | sh
+
 RUN rm -rf /usr/bin/python3 && ln -s ` + "`realpath \\`pyenv which python\\`` /usr/bin/python3 && chmod +x /usr/bin/python3", nil
-	// for sitePackagesLocation, kind of need to determine which specific version latest is (3.8 -> 3.8.17 or 3.8.18)
-	// install-latest essentially does pyenv install --list | grep $py | tail -1
-	// there are many bad options, but a symlink to $(pyenv prefix) is the least bad one
 }
 
 func (g *StandardGenerator) installCog() (string, error) {
@@ -451,7 +450,7 @@ func (g *StandardGenerator) installCog() (string, error) {
 		cmds := []string{
 			"ENV R8_COG_VERSION=coglet",
 			"ENV R8_PYTHON_VERSION=" + g.Config.Build.PythonVersion,
-			"RUN pip install " + m.LatestCoglet.URL,
+			"RUN --mount=type=cache,target=/srv/r8/uv/cache,id=uv-cache uv pip install " + m.LatestCoglet.URL,
 		}
 		return strings.Join(cmds, "\n"), nil
 	}
@@ -469,13 +468,13 @@ func (g *StandardGenerator) installCog() (string, error) {
 	if err != nil {
 		return "", err
 	}
-	pipInstallLine := "RUN --mount=type=cache,target=/root/.cache/pip pip install --no-cache-dir"
-	pipInstallLine += " " + containerPath
-	pipInstallLine += " 'pydantic>=1.9,<3'"
+	uvInstallLine := "RUN --mount=type=cache,target=/srv/r8/uv/cache,id=uv-cache uv pip install --no-cache-dir"
+	uvInstallLine += " " + containerPath
+	uvInstallLine += " 'pydantic>=1.9,<3'"
 	if g.strip {
-		pipInstallLine += " && " + StripDebugSymbolsCommand
+		uvInstallLine += " && " + StripDebugSymbolsCommand
 	}
-	lines = append(lines, CFlags, pipInstallLine, "ENV CFLAGS=")
+	lines = append(lines, CFlags, uvInstallLine, "ENV CFLAGS=")
 	return strings.Join(lines, "\n"), nil
 }
 
@@ -509,14 +508,14 @@ func (g *StandardGenerator) pipInstalls() (string, error) {
 		return "", err
 	}
 
-	pipInstallLine := "RUN --mount=type=cache,target=/root/.cache/pip pip install -r " + containerPath
+	uvInstallLine := "RUN --mount=type=cache,target=/srv/r8/uv/cache,id=uv-cache uv pip install -r " + containerPath
 	if g.strip {
-		pipInstallLine += " && " + StripDebugSymbolsCommand
+		uvInstallLine += " && " + StripDebugSymbolsCommand
 	}
 	return strings.Join([]string{
 		copyLine[0],
 		CFlags,
-		pipInstallLine,
+		uvInstallLine,
 		"ENV CFLAGS=",
 	}, "\n"), nil
 }

diff --git a/pkg/dockerfile/standard_generator_test.go b/pkg/dockerfile/standard_generator_test.go
@@ -47,7 +47,7 @@ func testInstallCog(relativeTmpDir string, stripped bool) string {
 	}
 	return fmt.Sprintf(`COPY %s/%s /tmp/%s
 ENV CFLAGS="-O3 -funroll-loops -fno-strict-aliasing -flto -S"
-RUN --mount=type=cache,target=/root/.cache/pip pip install --no-cache-dir /tmp/%s 'pydantic>=1.9,<3'%s
+RUN --mount=type=cache,target=/srv/r8/uv/cache,id=uv-cache uv pip install --no-cache-dir /tmp/%s 'pydantic>=1.9,<3'%s
 ENV CFLAGS=`, relativeTmpDir, wheel, wheel, wheel, strippedCall)
 }
 
@@ -73,13 +73,15 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked apt-get update -qq &
 	git \
 	ca-certificates \
 	&& rm -rf /var/lib/apt/lists/*
+
+ENV UV_CACHE_DIR="/srv/r8/uv/cache"
 RUN --mount=type=cache,target=/root/.cache/pip curl -s -S -L https://raw.githubusercontent.com/pyenv/pyenv-installer/master/bin/pyenv-installer | bash && \
 	git clone https://github.com/momo-lab/pyenv-install-latest.git "$(pyenv root)"/plugins/pyenv-install-latest && \
 	export PYTHON_CONFIGURE_OPTS='--enable-optimizations --with-lto' && \
 	export PYTHON_CFLAGS='-O3' && \
 	pyenv install-latest "%s" && \
 	pyenv global $(pyenv install-latest --print "%s") && \
-	pip install "wheel<1"
+	curl -LsSf https://astral.sh/uv/install.sh | sh
 `, version, version)
 }
 
@@ -414,7 +416,7 @@ ENV NVIDIA_DRIVER_CAPABILITIES=all
 ` + testInstallPython("3.12") + `RUN rm -rf /usr/bin/python3 && ln -s ` + "`realpath \\`pyenv which python\\`` /usr/bin/python3 && chmod +x /usr/bin/python3" + `
 COPY ` + gen.relativeTmpDir + `/requirements.txt /tmp/requirements.txt
 ENV CFLAGS="-O3 -funroll-loops -fno-strict-aliasing -flto -S"
-RUN --mount=type=cache,target=/root/.cache/pip pip install -r /tmp/requirements.txt
+RUN --mount=type=cache,target=/srv/r8/uv/cache,id=uv-cache uv pip install -r /tmp/requirements.txt
 ENV CFLAGS=
 ` + testInstallCog(gen.relativeTmpDir, gen.strip) + `
 RUN find / -type f -name "*python*.so" -printf "%h\n" | sort -u > /etc/ld.so.conf.d/cog.conf && ldconfig
@@ -898,3 +900,56 @@ torch==2.3.1
 pandas==2.0.3
 coglet @ https://github.com/replicate/cog-runtime/releases/download/v0.1.0-alpha31/coglet-0.1.0a31-py3-none-any.whl`, string(requirements))
 }
+
+func TestGenerateDockerfileStripped(t *testing.T) {
+	tmpDir := t.TempDir()
+
+	conf, err := config.FromYAML([]byte(`
+build:
+  gpu: true
+  cuda: "11.8"
+  python_version: "3.12"
+  system_packages:
+    - ffmpeg
+    - cowsay
+  python_packages:
+    - torch==2.3.1
+    - pandas==2.0.3
+  run:
+    - "cowsay moo"
+predict: predict.py:Predictor
+`))
+	require.NoError(t, err)
+	require.NoError(t, conf.ValidateAndComplete(""))
+	command := dockertest.NewMockCommand()
+	client := registrytest.NewMockRegistryClient()
+	gen, err := NewStandardGenerator(conf, tmpDir, command, client, true)
+	require.NoError(t, err)
+	gen.SetUseCogBaseImage(true)
+	gen.SetStrip(true)
+	_, actual, _, err := gen.GenerateModelBaseWithSeparateWeights(t.Context(), "r8.im/replicate/cog-test")
+	require.NoError(t, err)
+
+	expected := `#syntax=docker/dockerfile:1.4
+FROM r8.im/replicate/cog-test-weights AS weights
+FROM r8.im/cog-base:cuda11.8-python3.12-torch2.3.1
+RUN --mount=type=cache,target=/var/cache/apt,sharing=locked apt-get update -qq && apt-get install -qqy cowsay && rm -rf /var/lib/apt/lists/*
+COPY ` + gen.relativeTmpDir + `/requirements.txt /tmp/requirements.txt
+ENV CFLAGS="-O3 -funroll-loops -fno-strict-aliasing -flto -S"
+RUN --mount=type=cache,target=/srv/r8/uv/cache,id=uv-cache uv pip install -r /tmp/requirements.txt && find / -type f -name "*python*.so" -not -name "*cpython*.so" -exec strip -S {} \;
+ENV CFLAGS=
+RUN find / -type f -name "*.py[co]" -delete && find / -type f -name "*.py" -exec touch -t 197001010000 {} \; && find / -type f -name "*.py" -printf "%h\n" | sort -u | /usr/bin/python3 -m compileall --invalidation-mode timestamp -o 2 -j 0
+RUN cowsay moo
+WORKDIR /src
+EXPOSE 5000
+CMD ["python", "-m", "cog.server.http"]
+COPY . /src`
+
+	require.Equal(t, expected, actual)
+
+	requirements, err := os.ReadFile(path.Join(gen.tmpDir, "requirements.txt"))
+	require.NoError(t, err)
+	require.Equal(t, `--extra-index-url https://download.pytorch.org/whl/cu118
+torch==2.3.1
+pandas==2.0.3`, string(requirements))
+}
diff --git a/python/cog/server/webhook.py b/python/cog/server/webhook.py
@@ -1,4 +1,5 @@
 import os
+from concurrent.futures import ThreadPoolExecutor
 from typing import Any, Callable, Set
 
 import requests
@@ -16,6 +17,10 @@
 log = structlog.get_logger(__name__)
 
 _response_interval = float(os.environ.get("COG_THROTTLE_RESPONSE_INTERVAL", 0.5))
+_webhook_timeout = float(
+    os.environ.get("COG_WEBHOOK_TIMEOUT", 10.0)
+)  # 10 second timeout by default
+_webhook_executor = ThreadPoolExecutor(max_workers=4, thread_name_prefix="webhook")
 
 # HACK: signal that we should skip the start webhook when the response interval
 # is tuned below 100ms. This should help us get output sooner for models that
@@ -27,11 +32,40 @@ def webhook_caller_filtered(
     webhook: str,
     webhook_events_filter: Set[WebhookEvent],
 ) -> Callable[[Any, WebhookEvent], None]:
-    upstream_caller = webhook_caller(webhook)
+    # Create a session for this webhook
+    default_session = requests_session()
+    retry_session = requests_session_with_retries()
+    throttler = ResponseThrottler(response_interval=_response_interval)
+
+    def _send_webhook(response: PredictionResponse, session: requests.Session) -> None:
+        if PYDANTIC_V2:
+            dict_response = jsonable_encoder(response.model_dump(exclude_unset=True))
+        else:
+            dict_response = jsonable_encoder(response.dict(exclude_unset=True))
+
+        try:
+            session.post(webhook, json=dict_response, timeout=_webhook_timeout)
+        except requests.exceptions.Timeout:
+            log.warn("webhook request timed out", webhook=webhook)
+        except requests.exceptions.RequestException:
+            log.warn("caught exception while sending webhook", exc_info=True)
 
     def caller(response: PredictionResponse, event: WebhookEvent) -> None:
-        if event in webhook_events_filter:
-            upstream_caller(response)
+        if event not in webhook_events_filter:
+            return
+
+        if not throttler.should_send_response(response):
+            return
+
+        # Use a separate thread for webhook calls to avoid blocking
+        if Status.is_terminal(response.status):
+            # For terminal updates, retry persistently but in background
+            _webhook_executor.submit(_send_webhook, response, retry_session)
+        else:
+            # For other requests, don't retry, and ignore any errors
+            _webhook_executor.submit(_send_webhook, response, default_session)
+
+        throttler.update_last_sent_response_time()
 
     return caller
 
@@ -44,24 +78,32 @@ def webhook_caller(webhook: str) -> Callable[[Any], None]:
     default_session = requests_session()
     retry_session = requests_session_with_retries()
 
+    def _send_webhook(response: PredictionResponse, session: requests.Session) -> None:
+        if PYDANTIC_V2:
+            dict_response = jsonable_encoder(response.model_dump(exclude_unset=True))
+        else:
+            dict_response = jsonable_encoder(response.dict(exclude_unset=True))
+
+        try:
+            session.post(webhook, json=dict_response, timeout=_webhook_timeout)
+        except requests.exceptions.Timeout:
+            log.warn("webhook request timed out", webhook=webhook)
+        except requests.exceptions.RequestException:
+            log.warn("caught exception while sending webhook", exc_info=True)
+
     def caller(response: PredictionResponse) -> None:
-        if throttler.should_send_response(response):
-            if PYDANTIC_V2:
-                dict_response = jsonable_encoder(
-                    response.model_dump(exclude_unset=True)
-                )
-            else:
-                dict_response = jsonable_encoder(response.dict(exclude_unset=True))
-            if Status.is_terminal(response.status):
-                # For terminal updates, retry persistently
-                retry_session.post(webhook, json=dict_response)
-            else:
-                # For other requests, don't retry, and ignore any errors
-                try:
-                    default_session.post(webhook, json=dict_response)
-                except requests.exceptions.RequestException:
-                    log.warn("caught exception while sending webhook", exc_info=True)
-            throttler.update_last_sent_response_time()
+        if not throttler.should_send_response(response):
+            return
+
+        # Use a separate thread for webhook calls to avoid blocking
+        if Status.is_terminal(response.status):
+            # For terminal updates, retry persistently but in background
+            _webhook_executor.submit(_send_webhook, response, retry_session)
+        else:
+            # For other requests, don't retry, and ignore any errors
+            _webhook_executor.submit(_send_webhook, response, default_session)
+
+        throttler.update_last_sent_response_time()
 
     return caller
 
@@ -84,13 +126,12 @@ def requests_session() -> requests.Session:
 
 
 def requests_session_with_retries() -> requests.Session:
-    # This session will retry requests up to 12 times, with exponential
-    # backoff. In total it'll try for up to roughly 320 seconds, providing
-    # resilience through temporary networking and availability issues.
+    # This session will retry requests up to 6 times (reduced from 12), with exponential
+    # backoff. In total it'll try for up to roughly 60 seconds (reduced from 320s).
     session = requests_session()
     adapter = HTTPAdapter(
         max_retries=Retry(
-            total=12,
+            total=6,  # Reduced from 12 to avoid blocking too long
             backoff_factor=0.1,
             status_forcelist=[429, 500, 502, 503, 504],
             allowed_methods=["POST"],