From fccf33653e8f9b555711f3dbbf20d919393466ac Mon Sep 17 00:00:00 2001 From: romain-priour-lc Date: Fri, 13 Jun 2025 17:39:55 -0700 Subject: [PATCH 1/7] page1 for exposing telemetry done --- docs/self_hosting/index.md | 2 + .../observability/export_backend.mdx | 86 +++++++++++++++++++ docs/self_hosting/observability/index.md | 16 ++++ .../observability/langsmith_collector.mdx | 12 +++ 4 files changed, 116 insertions(+) create mode 100644 docs/self_hosting/observability/export_backend.mdx create mode 100644 docs/self_hosting/observability/index.md create mode 100644 docs/self_hosting/observability/langsmith_collector.mdx diff --git a/docs/self_hosting/index.md b/docs/self_hosting/index.md index d45fbf237..230139112 100644 --- a/docs/self_hosting/index.md +++ b/docs/self_hosting/index.md @@ -34,3 +34,5 @@ Step-by-step guides that cover the installation, configuration, and scaling of y - [Week of January 29, 2024 - LangSmith v0.2](./self_hosting/release_notes#week-of-january-29-2024---langsmith-v02): Release notes for version 0.2 of LangSmith. - [FAQ](./self_hosting/faq): Frequently asked questions about LangSmith. - [Troubleshooting](./self_hosting/troubleshooting): Troubleshooting common issues with your Self-Hosted LangSmith instance. +- [Observability](./self_hosting/observability): How to access telemetry data for your self-hosted LangSmith instance. + - [Export LangSmith telemetry to your backend](./self_hosting/observability/export_backend): How to export telemetry data from LangSmith to your observability backend. 
diff --git a/docs/self_hosting/observability/export_backend.mdx b/docs/self_hosting/observability/export_backend.mdx new file mode 100644 index 000000000..ffde9d9fa --- /dev/null +++ b/docs/self_hosting/observability/export_backend.mdx @@ -0,0 +1,86 @@ +--- +sidebar_label: Export LangSmith Telemetry +sidebar_position: 9 +--- + +# Exporting LangSmith telemetry to your observability backend + +:::warning Important +**This section is only applicable for Kubernetes deployments.** +::: + +Self-Hosted LangSmith instances produce telemetry data in the form of logs, metrics and traces. This section will show you how to access all of that data, and how to export that data to +your observability collector or backend. + +This section assumes that you have monitoring infrastructure set up already, or you will set up this infrastructure and you want to know how to configure it as well as LangSmith to collect data. + +Infrastructure refers to: +- A collector, such as [OpenTelemetry](https://opentelemetry.io/docs/collector/), [FluentBit](https://docs.fluentbit.io/manual) or [Prometheus](https://prometheus.io/) +- An observability backend, such as [Datadog](https://www.datadoghq.com/) + +If you would like LangSmith to bring up a collector or a full observability stack, check the other pages under the [Self-Hosted Observability](/self_hosting/observability) section. + +## Logs +All services that are part of the LangSmith self-hosted deployment write their logs to their node's filesystem. This includes Postgres, Redis and Clickhouse if you are running the default in-cluter versions. +In order to access these logs, you need to set up your collector to read from said files. Most popular collectors support reading file logs. 
+ +For example: +- OpenTelemetry: [File Log Receiver](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/receiver/filelogreceiver) +- FluentBit: [Tail Input](https://docs.fluentbit.io/manual/pipeline/inputs/tail) +- Datadog: [Kubernetes Log Collection](https://docs.datadoghq.com/containers/kubernetes/log/?tab=datadogoperator) + +## Metrics + ### LangSmith Services + The following LangSmith services expose metrics at an endpoint, in the Prometheus metrics format. + - Backend Service: `http://langsmith-backend..svc.cluster.local:1984/metrics` + - Platform Backend Service: `http://langsmith-platform-backend..svc.cluster.local:1986/metrics` + - Host Backend Service: `http://host-backend..svc.cluster.local:1985/metrics` + - Playground Service: `http://langsmith-playground..svc.cluster.local:1988/metrics` + + It is recommended to use a [Prometheus server](https://prometheus.io/docs/prometheus/latest/getting_started/#configure-prometheus-to-monitor-the-sample-targets) or + [OpenTelemetry collector](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/receiver/prometheusreceiver) to scrape the endpoint, and export it to the + backend of your choice. + + :::warning Important + **The following sections apply for in-cluster databases only. 
If you are using external databases, you will need to configure exposing and fetching metrics.** + ::: + ### Redis + If you are using the in-cluster Redis instance from the Helm chart, LangSmith can expose metrics for you if you upgrade the chart with the following values: + ```yaml + redis: + metrics: + enabled: true + ``` + This will run a sidecar container alongside your redis instance which will expose Prometheus metrics at: `http://langsmith-..svc.cluster.local:9121/metrics` + + ### Postgres + Similarly, to expose Postgres metrics, upgrade the LangSmith Helm chart with the following values: + ```yaml + postgres: + metrics: + enabled: true + ``` + This will run a sidecar container, exposing Prometheus metrics at `http://langsmith-..svc.cluster.local:9187/metrics` + + ### Clickhouse + The Clickhouse container can expose metrics directly, without the need for a sidecar. To expose the metrics endpoint, run the LangSmith Helm chart with the + following values: + ```yaml + clickhouse: + metrics: + enabled: true + ``` + You can then scrape metrics at `http://langsmith-..svc.cluster.local:9363/metrics` + +## Traces +The LangSmith Backend, LangSmith Platform Backend, LangSmith Queue, and LangSmith Playground services have been instrumented using the OTEL SDK to emit +traces adhering to the [OpenTelemetry format](https://opentelemetry.io/docs/concepts/signals/traces/). Tracing is toggled off by default, and can be enabled +by adding the following values to your configuration file, and upgrading your LangSmith helm chart: + ```yaml + config: + tracing: + enabled: true + endpoint: "" + useTls: true + ``` +This will export traces from all LangSmith backend services to the specified endpoint. 
diff --git a/docs/self_hosting/observability/index.md b/docs/self_hosting/observability/index.md new file mode 100644 index 000000000..03072824b --- /dev/null +++ b/docs/self_hosting/observability/index.md @@ -0,0 +1,16 @@ +--- +sidebar_label: Self-Hosted Observability +sidebar_position: 11 +description: "Observability guides for LangSmith" +--- + +# Self-Hosted Observability + +This section contains guides for accessing telemetry data for your self-hosted LangSmith deployments. + +:::warning Important +**This section is only applicable for Kubernetes deployments.** +::: + +- [Export LangSmith Telemetry](./observability/export_backend): Export logs, metrics and traces to your collector/backend of choice. +- [Install a Collector for LangSmith Telemetry](./observability/langsmith_collector): How to use the LangSmith observability [helm chart](https://github.com/langchain-ai/helm) to deploy an OpenTelemetry Collector and export telemetry to your backend. \ No newline at end of file diff --git a/docs/self_hosting/observability/langsmith_collector.mdx b/docs/self_hosting/observability/langsmith_collector.mdx new file mode 100644 index 000000000..a3b193133 --- /dev/null +++ b/docs/self_hosting/observability/langsmith_collector.mdx @@ -0,0 +1,12 @@ +--- +sidebar_label: Bring up a LangSmith OTEL Collector +sidebar_position: 9 +--- + +# Exporting LangSmith telemetry to your observability backend + +Self-Hosted LangSmith instances produce telemetry data in the form of logs, metrics and traces + +:::warning Important +**This section is only applicable for Kubernetes deployments.** +::: \ No newline at end of file From a9e6d13bd16f1660b3f26671c1e18e2d55aeba58 Mon Sep 17 00:00:00 2001 From: romain-priour-lc Date: Mon, 16 Jun 2025 15:11:18 -0700 Subject: [PATCH 2/7] collector configuration --- .../observability/langsmith_collector.mdx | 193 +++++++++++++++++- 1 file changed, 189 insertions(+), 4 deletions(-) diff --git a/docs/self_hosting/observability/langsmith_collector.mdx 
b/docs/self_hosting/observability/langsmith_collector.mdx index a3b193133..a72c50e5d 100644 --- a/docs/self_hosting/observability/langsmith_collector.mdx +++ b/docs/self_hosting/observability/langsmith_collector.mdx @@ -1,12 +1,197 @@ --- -sidebar_label: Bring up a LangSmith OTEL Collector +sidebar_label: OTEL Collector Configuration sidebar_position: 9 --- -# Exporting LangSmith telemetry to your observability backend +# Configure your Telemetry collector to scrape LangSmith services -Self-Hosted LangSmith instances produce telemetry data in the form of logs, metrics and traces +As seen in the previous section, the various services in a LangSmith deployment emit telemetry data in the form of logs, metrics and traces. +You may already have telemetry collectors set up in your Kubernetes cluster, or would like to deploy one to monitor your application. + +This section will show you the way we would configure an [OpenTelemetry Collector](https://opentelemetry.io/docs/collector/configuration/) to export telemetry data from LangSmith. +Note that all of the concepts discussed below can be translated to other collectors such as [Fluentd](https://www.fluentd.org/) or [FluentBit](https://fluentbit.io/). :::warning Important **This section is only applicable for Kubernetes deployments.** -::: \ No newline at end of file +::: + +## Receivers +### Logs +As discussed previously, logs are read from the filesystem of the nodes/containers running the application. An example configuration for reading logs from files: +```yaml +filelog: + exclude: [] + include: + - /var/log/pods/*/*/*.log + include_file_name: false + include_file_path: true + operators: + - id: container-parser + max_log_size: 102400 + type: container + retry_on_failure: + enabled: true + start_at: end +``` + +:::note Note +**The above configuration reads all logs from all files. If you would like to only read LangSmith logs, you need to either:** + +1. Only include files from containers in your LangSmith namespace. 
+2. Filter out logs from other namespaces in your processing logic. +::: + +### Metrics +Metrics can be scraped using the Prometheus endpoints. The configuration below will scrape all database and service metrics: +```yaml +prometheus: │ + scrape_configs: │ + - job_name: database-metrics │ + scrape_interval: 15s │ + static_configs: │ + - targets: │ + - ..svc.cluster.local:/metrics │ + - ..svc.cluster.local:/metrics │ + - ..svc.cluster.local:/metrics │ + - job_name: service-metrics │ + scrape_interval: 15s │ + static_configs: │ + - targets: │ + - ..svc.cluster.local:/metrics │ + - ..svc.cluster.local:/metrics │ + - ..svc.cluster.local:/metrics │ + - ..svc.cluster.local:/metrics +``` + +### Traces +For traces, you need to enable the OTLP receiver. The following configuration can be used to listen to HTTP traces on port 4318, and GRPC on port 4317: +``` +otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 +``` + +## Processors +### Recommended OTEL Processors +The following processors are recommended when using the OTEL collector: + +- [Batch Processor](https://github.com/open-telemetry/opentelemetry-collector/blob/main/processor/batchprocessor/README.md): Groups the data into batches before sending to exporters. +- [Memory Limiter](https://github.com/open-telemetry/opentelemetry-collector/blob/main/processor/memorylimiterprocessor/README.md): Prevents the collector from using too much memory and crashing. When the soft limit is crossed, +the collector stops accepting new data. +- [K8s Attributes Processor] (https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/processor/k8sattributesprocessor): Adds K8s metadata such as pod name into the logs, metrics and traces + +## Exporters +Exporters just need to point to an external endpoint of your liking. 
The following configuration allows you to configure a separate endpoint for logs, metrics and traces: +```yaml +otlphttp/logs: + endpoint: + tls: false +otlphttp/metrics: + endpoint: + tls: false +otlphttp/traces: + endpoint: + tls: false +``` + +:::note Note +**The OTEL Collector also supports exporting directly to a [Datadog](https://docs.datadoghq.com/opentelemetry/setup/collector_exporter) endpoint.** +::: + +# Example Collector Configuration: +Note that this configuration uses a filter to drop any logs from files other than LangSmith ones in our namespace. +```yaml +receivers: + filelog: + exclude: [] + include: + - /var/log/pods/*/*/*.log + include_file_name: false + include_file_path: true + operators: + - id: container-parser + max_log_size: 102400 + type: container + retry_on_failure: + enabled: true + start_at: end + + prometheus: │ + scrape_configs: │ + - job_name: database-metrics │ + scrape_interval: 15s │ + static_configs: │ + - targets: │ + - ..svc.cluster.local:/metrics │ + - ..svc.cluster.local:/metrics │ + - ..svc.cluster.local:/metrics │ + - job_name: service-metrics │ + scrape_interval: 15s │ + static_configs: │ + - targets: │ + - ..svc.cluster.local:/metrics │ + - ..svc.cluster.local:/metrics │ + - ..svc.cluster.local:/metrics │ + - ..svc.cluster.local:/metrics + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + +processors: + batch: + send_batch_size: 8192 + timeout: 1s + memory_limiter: + check_interval: 1m + limit_percentage: 75 + spike_limit_percentage: 25 + filter: + error_mode: ignore + logs: + log_record: + - 'resource.attributes["k8s.namespace.name"] != ""' + - 'resource.attributes["k8s.app.name"] != ""' + k8sattributes: + extract: + labels: + - from: pod + key: app.kubernetes.io/name + tag_name: k8s.app.name + metadata: + - k8s.namespace.name + +exporters: + otlphttp/logs: + endpoint: + tls: + insecure: false +otlphttp/metrics: + endpoint: + tls: + insecure: false +otlphttp/traces: + endpoint: + 
tls: + insecure: false + +service: + pipelines: + logs/langsmith: + receivers: [filelog] + processors: [k8sattributes, filter, batch, memory_limiter] + exporters: [otlphttp/logs] + metrics/langsmith: + receivers: [prometheus] + processors: [batch, memory_limiter] + exporters: [otlphttp/metrics] + traces/langsmith: + receivers: [otlp] + processors: [batch, memory_limiter] + exporters: [otlphttp/traces] +``` From 674f19ee2eebf53b5531fc820d8972e6379d6354 Mon Sep 17 00:00:00 2001 From: romain-priour-lc Date: Mon, 16 Jun 2025 16:51:54 -0700 Subject: [PATCH 3/7] docs done for OTEL + telemetry data --- docs/self_hosting/index.md | 3 +- .../observability/export_backend.mdx | 81 ++++++------ docs/self_hosting/observability/index.md | 10 +- .../observability/langsmith_collector.mdx | 123 ++++++++++-------- 4 files changed, 119 insertions(+), 98 deletions(-) diff --git a/docs/self_hosting/index.md b/docs/self_hosting/index.md index 230139112..d7cc06d37 100644 --- a/docs/self_hosting/index.md +++ b/docs/self_hosting/index.md @@ -35,4 +35,5 @@ Step-by-step guides that cover the installation, configuration, and scaling of y - [FAQ](./self_hosting/faq): Frequently asked questions about LangSmith. - [Troubleshooting](./self_hosting/troubleshooting): Troubleshooting common issues with your Self-Hosted LangSmith instance. - [Observability](./self_hosting/observability): How to access telemetry data for your self-hosted LangSmith instance. - - [Export LangSmith telemetry to your backend](./self_hosting/observability/export_backend): How to export telemetry data from LangSmith to your observability backend. + - [Export LangSmith telemetry](./self_hosting/observability/export_backend): Export logs, metrics and traces to your collector and/or backend of choice. + - [Collector configuration](./self_hosting/observability/langsmith_collector): Example yaml configurations for an OTel collector to gather LangSmith telemetry data. 
diff --git a/docs/self_hosting/observability/export_backend.mdx b/docs/self_hosting/observability/export_backend.mdx index ffde9d9fa..b57df5a4e 100644 --- a/docs/self_hosting/observability/export_backend.mdx +++ b/docs/self_hosting/observability/export_backend.mdx @@ -9,38 +9,40 @@ sidebar_position: 9 **This section is only applicable for Kubernetes deployments.** ::: -Self-Hosted LangSmith instances produce telemetry data in the form of logs, metrics and traces. This section will show you how to access all of that data, and how to export that data to -your observability collector or backend. +Self-Hosted LangSmith instances produce telemetry data in the form of logs, metrics and traces. This section will show you how to access and export that data to +an observability collector or backend. -This section assumes that you have monitoring infrastructure set up already, or you will set up this infrastructure and you want to know how to configure it as well as LangSmith to collect data. +This section assumes that you have monitoring infrastructure set up already, or you will set up this infrastructure and want to know how to configure LangSmith to collect data from it. Infrastructure refers to: -- A collector, such as [OpenTelemetry](https://opentelemetry.io/docs/collector/), [FluentBit](https://docs.fluentbit.io/manual) or [Prometheus](https://prometheus.io/) -- An observability backend, such as [Datadog](https://www.datadoghq.com/) -If you would like LangSmith to bring up a collector or a full observability stack, check the other pages under the [Self-Hosted Observability](/self_hosting/observability) section. 
+- Collectors, such as [OpenTelemetry](https://opentelemetry.io/docs/collector/), [FluentBit](https://docs.fluentbit.io/manual) or [Prometheus](https://prometheus.io/) +- Observability backends, such as [Datadog](https://www.datadoghq.com/) or [the Grafana ecosystem](https://grafana.com/) -## Logs -All services that are part of the LangSmith self-hosted deployment write their logs to their node's filesystem. This includes Postgres, Redis and Clickhouse if you are running the default in-cluter versions. -In order to access these logs, you need to set up your collector to read from said files. Most popular collectors support reading file logs. +## Logs: [OTel Example](./langsmith_collector#logs) -For example: -- OpenTelemetry: [File Log Receiver](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/receiver/filelogreceiver) -- FluentBit: [Tail Input](https://docs.fluentbit.io/manual/pipeline/inputs/tail) -- Datadog: [Kubernetes Log Collection](https://docs.datadoghq.com/containers/kubernetes/log/?tab=datadogoperator) +All services that are part of the LangSmith self-hosted deployment write their logs to their node/container filesystem. This includes Postgres, Redis and Clickhouse if you are running the default in-cluter versions. +In order to access these logs, you need to set up your collector to read from those files. Most popular collectors support reading logs from container filsystems. 
+ +Example file system integrations: + +- **OpenTelemetry**: [File Log Receiver](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/receiver/filelogreceiver) +- **FluentBit**: [Tail Input](https://docs.fluentbit.io/manual/pipeline/inputs/tail) +- **Datadog**: [Kubernetes Log Collection](https://docs.datadoghq.com/containers/kubernetes/log/?tab=datadogoperator) + +## Metrics: [OTel Example](./langsmith_collector#metrics) -## Metrics ### LangSmith Services The following LangSmith services expose metrics at an endpoint, in the Prometheus metrics format. - - Backend Service: `http://langsmith-backend..svc.cluster.local:1984/metrics` - - Platform Backend Service: `http://langsmith-platform-backend..svc.cluster.local:1986/metrics` - - Host Backend Service: `http://host-backend..svc.cluster.local:1985/metrics` - - Playground Service: `http://langsmith-playground..svc.cluster.local:1988/metrics` - - It is recommended to use a [Prometheus server](https://prometheus.io/docs/prometheus/latest/getting_started/#configure-prometheus-to-monitor-the-sample-targets) or - [OpenTelemetry collector](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/receiver/prometheusreceiver) to scrape the endpoint, and export it to the + - Backend: `http://..svc.cluster.local:1984/metrics` + - Platform Backend: `http://..svc.cluster.local:1986/metrics` + - Host Backend: `http:/..svc.cluster.local:1985/metrics` + - Playground: `http://..svc.cluster.local:1988/metrics` + + It is recommended to use a [Prometheus server](https://prometheus.io/docs/prometheus/latest/getting_started/#configure-prometheus-to-monitor-the-sample-targets) or + [OpenTelemetry collector](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/receiver/prometheusreceiver) to scrape the endpoint, and export it to the backend of your choice. - + :::warning Important **The following sections apply for in-cluster databases only. 
If you are using external databases, you will need to configure exposing and fetching metrics.** ::: @@ -51,8 +53,8 @@ For example: metrics: enabled: true ``` - This will run a sidecar container alongside your redis instance which will expose Prometheus metrics at: `http://langsmith-..svc.cluster.local:9121/metrics` - + This will run a sidecar container alongside your redis container which will expose Prometheus metrics at: `http://langsmith-..svc.cluster.local:9121/metrics` + ### Postgres Similarly, to expose Postgres metrics, upgrade the LangSmith Helm chart with the following values: ```yaml @@ -60,10 +62,10 @@ For example: metrics: enabled: true ``` - This will run a sidecar container, exposing Prometheus metrics at `http://langsmith-..svc.cluster.local:9187/metrics` - + This will run a sidecar container alongside Postgres, exposing Prometheus metrics at `http://langsmith-..svc.cluster.local:9187/metrics` + ### Clickhouse - The Clickhouse container can expose metrics directly, without the need for a sidecar. To expose the metrics endpoint, run the LangSmith Helm chart with the + The Clickhouse container can expose metrics directly, without the need for a sidecar. To expose the metrics endpoint, run the LangSmith Helm chart with the following values: ```yaml clickhouse: @@ -72,15 +74,20 @@ For example: ``` You can then scrape metrics at `http://langsmith-..svc.cluster.local:9363/metrics` -## Traces -The LangSmith Backend, LangSmith Platform Backend, LangSmith Queue, and LangSmith Playground services have been instrumented using the OTEL SDK to emit +## Traces [OTel Example](./langsmith_collector#traces) + +The LangSmith Backend, Platform Backend, and Playground services have been instrumented using the OTEL SDK to emit traces adhering to the [OpenTelemetry format](https://opentelemetry.io/docs/concepts/signals/traces/). 
Tracing is toggled off by default, and can be enabled -by adding the following values to your configuration file, and upgrading your LangSmith helm chart: - ```yaml - config: - tracing: - enabled: true - endpoint: "" - useTls: true - ``` +and customized with the following in your `values.yaml` file: + +```yaml +config: +tracing: +enabled: true +endpoint: "" +useTls: true # Or false +env: "ls_self_hosted" # This value will be set as an "env" attribute in your spans +exporter: "http" # must be either http or grpc +``` + This will export traces from all LangSmith backend services to the specified endpoint. diff --git a/docs/self_hosting/observability/index.md b/docs/self_hosting/observability/index.md index 03072824b..da9f95683 100644 --- a/docs/self_hosting/observability/index.md +++ b/docs/self_hosting/observability/index.md @@ -6,11 +6,7 @@ description: "Observability guides for LangSmith" # Self-Hosted Observability -This section contains guides for accessing telemetry data for your self-hosted LangSmith deployments. +This section contains guides for accessing telemetry data for your self-hosted LangSmith deployments. -:::warning Important -**This section is only applicable for Kubernetes deployments.** -::: - -- [Export LangSmith Telemetry](./observability/export_backend): Export logs, metrics and traces to your collector/backend of choice. -- [Install a Collector for LangSmith Telemetry](./observability/langsmith_collector): How to use the LangSmith observability [helm chart](https://github.com/langchain-ai/helm) to deploy an OpenTelemetry Collector and export telemetry to your backend. \ No newline at end of file +- [Export LangSmith Telemetry](./observability/export_backend): Export logs, metrics and traces to your collector and/or backend of choice. +- [Configure a Collector for LangSmith Telemetry](./observability/langsmith_collector): Example yaml configurations for an OTel collector to gather LangSmith telemetry data. 
diff --git a/docs/self_hosting/observability/langsmith_collector.mdx b/docs/self_hosting/observability/langsmith_collector.mdx index a72c50e5d..d33b6e835 100644 --- a/docs/self_hosting/observability/langsmith_collector.mdx +++ b/docs/self_hosting/observability/langsmith_collector.mdx @@ -1,14 +1,14 @@ --- -sidebar_label: OTEL Collector Configuration +sidebar_label: Collector Configuration sidebar_position: 9 --- -# Configure your Telemetry collector to scrape LangSmith services +# Configure your Collector to gather LangSmith telemetry As seen in the previous section, the various services in a LangSmith deployment emit telemetry data in the form of logs, metrics and traces. You may already have telemetry collectors set up in your Kubernetes cluster, or would like to deploy one to monitor your application. -This section will show you the way we would configure an [OpenTelemetry Collector](https://opentelemetry.io/docs/collector/configuration/) to export telemetry data from LangSmith. +This section will show you how to configure an [OTel Collector](https://opentelemetry.io/docs/collector/configuration/) to gather telemetry data from LangSmith. Note that all of the concepts discussed below can be translated to other collectors such as [Fluentd](https://www.fluentd.org/) or [FluentBit](https://fluentbit.io/). :::warning Important @@ -16,13 +16,16 @@ Note that all of the concepts discussed below can be translated to other collect ::: ## Receivers + ### Logs + As discussed previously, logs are read from the filesystem of the nodes/containers running the application. An example configuration for reading logs from files: + ```yaml filelog: exclude: [] include: - - /var/log/pods/*/*/*.log + - /var/log/pods/*/*/*.log include_file_name: false include_file_path: true operators: @@ -35,56 +38,68 @@ filelog: ``` :::note Note -**The above configuration reads all logs from all files. 
If you would like to only read LangSmith logs, you need to either:** +**The above configuration reads logs from all files in the cluster that the collector has access to. If you would like to only read LangSmith logs, you need to either:** -1. Only include files from containers in your LangSmith namespace. -2. Filter out logs from other namespaces in your processing logic. -::: +1. **Only include files from containers in your LangSmith namespace.** + + or + +2. **Filter out logs from other namespaces and/or applications in your processing logic.** + ::: ### Metrics -Metrics can be scraped using the Prometheus endpoints. The configuration below will scrape all database and service metrics: + +Metrics can be scraped using the Prometheus endpoints. The configuration below scrapes all LangSmith database and service metrics: + ```yaml -prometheus: │ - scrape_configs: │ - - job_name: database-metrics │ - scrape_interval: 15s │ - static_configs: │ - - targets: │ - - ..svc.cluster.local:/metrics │ - - ..svc.cluster.local:/metrics │ - - ..svc.cluster.local:/metrics │ - - job_name: service-metrics │ - scrape_interval: 15s │ - static_configs: │ - - targets: │ - - ..svc.cluster.local:/metrics │ - - ..svc.cluster.local:/metrics │ - - ..svc.cluster.local:/metrics │ - - ..svc.cluster.local:/metrics +prometheus: + config: + scrape_configs: + - job_name: database-metrics + scrape_interval: 15s + static_configs: + - targets: + - ..svc.cluster.local:/metrics + - ..svc.cluster.local:/metrics + - ..svc.cluster.local:/metrics + - job_name: service-metrics + scrape_interval: 15s + static_configs: + - targets: + - ..svc.cluster.local:/metrics + - ..svc.cluster.local:/metrics + - ..svc.cluster.local:/metrics + - ..svc.cluster.local:/metrics ``` ### Traces + For traces, you need to enable the OTLP receiver. 
The following configuration can be used to listen to HTTP traces on port 4318, and GRPC on port 4317: + ``` otlp: protocols: - grpc: + grpc: endpoint: 0.0.0.0:4317 - http: + http: endpoint: 0.0.0.0:4318 ``` ## Processors + ### Recommended OTEL Processors -The following processors are recommended when using the OTEL collector: + +The following processors are recommended when using the OTel collector: - [Batch Processor](https://github.com/open-telemetry/opentelemetry-collector/blob/main/processor/batchprocessor/README.md): Groups the data into batches before sending to exporters. - [Memory Limiter](https://github.com/open-telemetry/opentelemetry-collector/blob/main/processor/memorylimiterprocessor/README.md): Prevents the collector from using too much memory and crashing. When the soft limit is crossed, -the collector stops accepting new data. -- [K8s Attributes Processor] (https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/processor/k8sattributesprocessor): Adds K8s metadata such as pod name into the logs, metrics and traces + the collector stops accepting new data. +- [Kubernetes Attributes Processor](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/processor/k8sattributesprocessor): Adds Kubernetes metadata such as pod name into the telemetry data. ## Exporters + Exporters just need to point to an external endpoint of your liking. 
The following configuration allows you to configure a separate endpoint for logs, metrics and traces: + ```yaml otlphttp/logs: endpoint: @@ -98,11 +113,13 @@ otlphttp/traces: ``` :::note Note -**The OTEL Collector also supports exporting directly to a [Datadog](https://docs.datadoghq.com/opentelemetry/setup/collector_exporter) endpoint.** +**The OTel Collector also supports exporting directly to a [Datadog](https://docs.datadoghq.com/opentelemetry/setup/collector_exporter) endpoint.** ::: # Example Collector Configuration: -Note that this configuration uses a filter to drop any logs from files other than LangSmith ones in our namespace. + +Note that this configuration uses a filter to drop any logs from files other than LangSmith logs in the deployment namespace. + ```yaml receivers: filelog: @@ -118,29 +135,29 @@ receivers: retry_on_failure: enabled: true start_at: end - - prometheus: │ - scrape_configs: │ - - job_name: database-metrics │ - scrape_interval: 15s │ - static_configs: │ - - targets: │ - - ..svc.cluster.local:/metrics │ - - ..svc.cluster.local:/metrics │ - - ..svc.cluster.local:/metrics │ - - job_name: service-metrics │ - scrape_interval: 15s │ - static_configs: │ - - targets: │ - - ..svc.cluster.local:/metrics │ - - ..svc.cluster.local:/metrics │ - - ..svc.cluster.local:/metrics │ + + prometheus: + scrape_configs: + - job_name: database-metrics + scrape_interval: 15s + static_configs: + - targets: + - ..svc.cluster.local:/metrics + - ..svc.cluster.local:/metrics + - ..svc.cluster.local:/metrics + - job_name: service-metrics + scrape_interval: 15s + static_configs: + - targets: + - ..svc.cluster.local:/metrics + - ..svc.cluster.local:/metrics + - ..svc.cluster.local:/metrics - ..svc.cluster.local:/metrics otlp: protocols: - grpc: + grpc: endpoint: 0.0.0.0:4317 - http: + http: endpoint: 0.0.0.0:4318 processors: @@ -179,7 +196,7 @@ otlphttp/traces: endpoint: tls: insecure: false - + service: pipelines: logs/langsmith: From 
b40e80c37d9d4d1dff83b7cb7bfa825fa78b57a0 Mon Sep 17 00:00:00 2001 From: romain-priour-lc Date: Mon, 16 Jun 2025 17:15:54 -0700 Subject: [PATCH 4/7] fix indentation on traceS --- docs/self_hosting/observability/export_backend.mdx | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/self_hosting/observability/export_backend.mdx b/docs/self_hosting/observability/export_backend.mdx index b57df5a4e..0eaa31217 100644 --- a/docs/self_hosting/observability/export_backend.mdx +++ b/docs/self_hosting/observability/export_backend.mdx @@ -82,12 +82,12 @@ and customized with the following in your `values.yaml` file: ```yaml config: -tracing: -enabled: true -endpoint: "" -useTls: true # Or false -env: "ls_self_hosted" # This value will be set as an "env" attribute in your spans -exporter: "http" # must be either http or grpc + tracing: + enabled: true + endpoint: "" + useTls: true # Or false + env: "ls_self_hosted" # This value will be set as an "env" attribute in your spans + exporter: "http" # must be either http or grpc ``` This will export traces from all LangSmith backend services to the specified endpoint. From bdf5cc9a132cb0af2dc3db3e4cc385519a6c5593 Mon Sep 17 00:00:00 2001 From: romain-priour-lc Date: Tue, 17 Jun 2025 11:46:52 -0700 Subject: [PATCH 5/7] tested with helm, working --- .../observability/export_backend.mdx | 16 +++- .../observability/langsmith_collector.mdx | 73 ++++++++++--------- 2 files changed, 52 insertions(+), 37 deletions(-) diff --git a/docs/self_hosting/observability/export_backend.mdx b/docs/self_hosting/observability/export_backend.mdx index 0eaa31217..d775ce6a7 100644 --- a/docs/self_hosting/observability/export_backend.mdx +++ b/docs/self_hosting/observability/export_backend.mdx @@ -36,7 +36,6 @@ Example file system integrations: The following LangSmith services expose metrics at an endpoint, in the Prometheus metrics format. 
- Backend: `http://..svc.cluster.local:1984/metrics` - Platform Backend: `http://..svc.cluster.local:1986/metrics` - - Host Backend: `http:/..svc.cluster.local:1985/metrics` - Playground: `http://..svc.cluster.local:1988/metrics` It is recommended to use a [Prometheus server](https://prometheus.io/docs/prometheus/latest/getting_started/#configure-prometheus-to-monitor-the-sample-targets) or @@ -64,6 +63,10 @@ Example file system integrations: ``` This will run a sidecar container alongside Postgres, exposing Prometheus metrics at `http://langsmith-..svc.cluster.local:9187/metrics` + :::note Note + **You can modify the Redis and Postgres exporter configurations through the LangSmith Helm chart.** + ::: + ### Clickhouse The Clickhouse container can expose metrics directly, without the need for a sidecar. To expose the metrics endpoint, run the LangSmith Helm chart with the following values: @@ -74,9 +77,9 @@ Example file system integrations: ``` You can then scrape metrics at `http://langsmith-..svc.cluster.local:9363/metrics` -## Traces [OTel Example](./langsmith_collector#traces) +## Traces: [OTel Example](./langsmith_collector#traces) -The LangSmith Backend, Platform Backend, and Playground services have been instrumented using the OTEL SDK to emit +The LangSmith Backend, Platform Backend, Playground and LangSmith Queue deployments have been instrumented using the OTEL SDK to emit traces adhering to the [OpenTelemetry format](https://opentelemetry.io/docs/concepts/signals/traces/). Tracing is toggled off by default, and can be enabled and customized with the following in your `values.yaml` file: @@ -91,3 +94,10 @@ config: ``` This will export traces from all LangSmith backend services to the specified endpoint. + +:::important Important +You can override the tracing endpoint for individual services. 
The Python apps require an endpoint in the form +`http://host:port/v1/traces`, while the Go app requires the same endpoint in the form `host:port` to send to the same collector. + +Make sure to check the logs of your services. If the endpoint is set correctly, there should be no logs. Otherwise, error logs will be shown. +::: diff --git a/docs/self_hosting/observability/langsmith_collector.mdx b/docs/self_hosting/observability/langsmith_collector.mdx index d33b6e835..9ff0ae953 100644 --- a/docs/self_hosting/observability/langsmith_collector.mdx +++ b/docs/self_hosting/observability/langsmith_collector.mdx @@ -59,17 +59,19 @@ prometheus: scrape_interval: 15s static_configs: - targets: - - ..svc.cluster.local:/metrics - - ..svc.cluster.local:/metrics - - ..svc.cluster.local:/metrics + - ..svc.cluster.local: + - ..svc.cluster.local: + - ..svc.cluster.local:..svc.cluster.local:/metrics - - ..svc.cluster.local:/metrics - - ..svc.cluster.local:/metrics - - ..svc.cluster.local:/metrics + - ..svc.cluster.local: + - ..svc.cluster.local: + - ..svc.cluster.local: + metrics_path: /metrics ``` ### Traces @@ -125,7 +127,7 @@ receivers: filelog: exclude: [] include: - - /var/log/pods/*/*/*.log + - /var/log/pods/*/*/*.log include_file_name: false include_file_path: true operators: @@ -137,22 +139,25 @@ receivers: start_at: end prometheus: - scrape_configs: - - job_name: database-metrics - scrape_interval: 15s - static_configs: - - targets: - - ..svc.cluster.local:/metrics - - ..svc.cluster.local:/metrics - - ..svc.cluster.local:/metrics - - job_name: service-metrics - scrape_interval: 15s - static_configs: - - targets: - - ..svc.cluster.local:/metrics - - ..svc.cluster.local:/metrics - - ..svc.cluster.local:/metrics - - ..svc.cluster.local:/metrics + config: + scrape_configs: + - job_name: database-metrics + scrape_interval: 15s + static_configs: + - targets: + - ..svc.cluster.local: + - ..svc.cluster.local: + - ..svc.cluster.local:..svc.cluster.local: + - ..svc.cluster.local: + - 
..svc.cluster.local: + metrics_path: /metrics otlp: protocols: grpc: @@ -172,28 +177,28 @@ processors: error_mode: ignore logs: log_record: - - 'resource.attributes["k8s.namespace.name"] != ""' - - 'resource.attributes["k8s.app.name"] != ""' + - 'resource.attributes["k8s.namespace.name"] != ""' + - 'resource.attributes["k8s.app.name"] != ""' k8sattributes: extract: labels: - - from: pod - key: app.kubernetes.io/name - tag_name: k8s.app.name + - from: pod + key: app.kubernetes.io/name + tag_name: k8s.app.name metadata: - - k8s.namespace.name + - k8s.namespace.name exporters: otlphttp/logs: endpoint: tls: insecure: false -otlphttp/metrics: - endpoint: + otlphttp/metrics: + endpoint: tls: insecure: false -otlphttp/traces: - endpoint: + otlphttp/traces: + endpoint: tls: insecure: false From 60f1cdd460f52ac8857d08a3c3a35f97ccaf0b1b Mon Sep 17 00:00:00 2001 From: romain-priour-lc Date: Wed, 18 Jun 2025 17:13:47 -0700 Subject: [PATCH 6/7] much imporved docs --- .../observability/export_backend.mdx | 30 +- .../observability/langsmith_collector.mdx | 337 +++++++++++------- 2 files changed, 223 insertions(+), 144 deletions(-) diff --git a/docs/self_hosting/observability/export_backend.mdx b/docs/self_hosting/observability/export_backend.mdx index d775ce6a7..bf58c965e 100644 --- a/docs/self_hosting/observability/export_backend.mdx +++ b/docs/self_hosting/observability/export_backend.mdx @@ -19,7 +19,7 @@ Infrastructure refers to: - Collectors, such as [OpenTelemetry](https://opentelemetry.io/docs/collector/), [FluentBit](https://docs.fluentbit.io/manual) or [Prometheus](https://prometheus.io/) - Observability backends, such as [Datadog](https://www.datadoghq.com/) or [the Grafana ecosystem](https://grafana.com/) -## Logs: [OTel Example](./langsmith_collector#logs) +# Logs: [OTel Example](./langsmith_collector#logs) All services that are part of the LangSmith self-hosted deployment write their logs to their node/container filesystem. 
This includes Postgres, Redis and Clickhouse if you are running the default in-cluter versions. In order to access these logs, you need to set up your collector to read from those files. Most popular collectors support reading logs from container filsystems. @@ -30,45 +30,45 @@ Example file system integrations: - **FluentBit**: [Tail Input](https://docs.fluentbit.io/manual/pipeline/inputs/tail) - **Datadog**: [Kubernetes Log Collection](https://docs.datadoghq.com/containers/kubernetes/log/?tab=datadogoperator) -## Metrics: [OTel Example](./langsmith_collector#metrics) +# Metrics: [OTel Example](./langsmith_collector#metrics) - ### LangSmith Services + ## LangSmith Services The following LangSmith services expose metrics at an endpoint, in the Prometheus metrics format. - Backend: `http://..svc.cluster.local:1984/metrics` - Platform Backend: `http://..svc.cluster.local:1986/metrics` - Playground: `http://..svc.cluster.local:1988/metrics` - It is recommended to use a [Prometheus server](https://prometheus.io/docs/prometheus/latest/getting_started/#configure-prometheus-to-monitor-the-sample-targets) or - [OpenTelemetry collector](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/receiver/prometheusreceiver) to scrape the endpoint, and export it to the + It is recommended to use a [Prometheus](https://prometheus.io/docs/prometheus/latest/getting_started/#configure-prometheus-to-monitor-the-sample-targets) or + [OpenTelemetry](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/receiver/prometheusreceiver) collector to scrape the endpoints, and export metrics to the backend of your choice. :::warning Important **The following sections apply for in-cluster databases only. 
If you are using external databases, you will need to configure exposing and fetching metrics.** ::: - ### Redis + ## Redis If you are using the in-cluster Redis instance from the Helm chart, LangSmith can expose metrics for you if you upgrade the chart with the following values: ```yaml redis: metrics: enabled: true ``` - This will run a sidecar container alongside your redis container which will expose Prometheus metrics at: `http://langsmith-..svc.cluster.local:9121/metrics` + This will run a sidecar container alongside your Redis container, and the LangSmith Redis service will expose Prometheus metrics at: `http://langsmith-..svc.cluster.local:9121/metrics` - ### Postgres + ## Postgres Similarly, to expose Postgres metrics, upgrade the LangSmith Helm chart with the following values: ```yaml postgres: metrics: enabled: true ``` - This will run a sidecar container alongside Postgres, exposing Prometheus metrics at `http://langsmith-..svc.cluster.local:9187/metrics` + This will run a sidecar container alongside Postgres, and expose a Prometheus metrics endpoint at `http://langsmith-..svc.cluster.local:9187/metrics` :::note Note - **You can modify the Redis and Postgres exporter configurations through the LangSmith Helm chart.** + **You can modify the Redis and Postgres sidecar exporter configs through the LangSmith Helm chart.** ::: - ### Clickhouse - The Clickhouse container can expose metrics directly, without the need for a sidecar. To expose the metrics endpoint, run the LangSmith Helm chart with the + ## Clickhouse + The Clickhouse container can expose metrics directly, without the need for a sidecar. 
To expose the metrics endpoint, upgrade your LangSmith Helm chart with the following values: ```yaml clickhouse: @@ -77,7 +77,7 @@ Example file system integrations: ``` You can then scrape metrics at `http://langsmith-..svc.cluster.local:9363/metrics` -## Traces: [OTel Example](./langsmith_collector#traces) +# Traces: [OTel Example](./langsmith_collector#traces) The LangSmith Backend, Platform Backend, Playground and LangSmith Queue deployments have been instrumented using the OTEL SDK to emit traces adhering to the [OpenTelemetry format](https://opentelemetry.io/docs/concepts/signals/traces/). Tracing is toggled off by default, and can be enabled @@ -96,8 +96,8 @@ config: This will export traces from all LangSmith backend services to the specified endpoint. :::important Important -You can override the tracing endpoint for individual services. The Python apps require an endpoint in the form +You can override the tracing endpoint for individual services. From our testing, the Python apps require an endpoint in the form `http://host:port/v1/traces`, while the Go app requires the same endpoint in the form `host:port` to send to the same collector. -Make sure to check the logs of your services. If the endpoint is set correctly, there should be no logs. Otherwise, error logs will be shown. +Make sure to check the logs of your services. If the trace endpoint is set incorrectly, you should see error logs in your service. 
::: diff --git a/docs/self_hosting/observability/langsmith_collector.mdx b/docs/self_hosting/observability/langsmith_collector.mdx index 9ff0ae953..c105060b3 100644 --- a/docs/self_hosting/observability/langsmith_collector.mdx +++ b/docs/self_hosting/observability/langsmith_collector.mdx @@ -15,65 +15,98 @@ Note that all of the concepts discussed below can be translated to other collect **This section is only applicable for Kubernetes deployments.** ::: -## Receivers +# Receivers -### Logs +## Logs -As discussed previously, logs are read from the filesystem of the nodes/containers running the application. An example configuration for reading logs from files: +As discussed previously, logs are read from the filesystem of the nodes/containers running the application. +This is an example for a Sidecar collector to read logs from its own container, excluding noisy logs from non-domain containers. +A Sidecar configuration is useful here because we require access to every container's filesystem. A DaemonSet can also be used. ```yaml filelog: - exclude: [] + exclude: + - "**/otc-container/*.log" + - "**/postgres-metrics-exporter/*.log" + - "**/redis-metrics-exporter/*.log" include: - - /var/log/pods/*/*/*.log + - /var/log/pods/${POD_NAMESPACE}_${POD_NAME}_${POD_UID}/*/*.log include_file_name: false include_file_path: true operators: - id: container-parser - max_log_size: 102400 type: container retry_on_failure: enabled: true start_at: end -``` - -:::note Note -**The above configuration reads logs from all files in the cluster that the collector has access to. If you would like to only read LangSmith logs, you need to either:** -1. 
**Only include files from containers in your LangSmith namespace.** - - or +env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: POD_UID + valueFrom: + fieldRef: + fieldPath: metadata.uid +volumes: + - name: varlogpods + hostPath: + path: /var/log/pods +volumeMounts: + - name: varlogpods + mountPath: /var/log/pods + readOnly: true ``` -2. **Filter out logs from other namespaces and/or application in your processing logic.** - ::: +:::info Note +**This configuration requires 'get', 'list', and 'watch' permissions on pods in the given namespace.** +::: -### Metrics +## Metrics -Metrics can be scraped using the Prometheus endpoints. The configuration below scraps all LangSmith database and service metrics: +Metrics can be scraped using the Prometheus endpoints. A single instance Gateway collector can be used to avoid +duplication of queries when fetching metrics. The following config scrapes all of the default named LangSmith services: ```yaml prometheus: config: scrape_configs: - - job_name: database-metrics - scrape_interval: 15s - static_configs: - - targets: - - ..svc.cluster.local: - - ..svc.cluster.local: - - ..svc.cluster.local:..svc.cluster.local: - - ..svc.cluster.local: - - ..svc.cluster.local: - metrics_path: /metrics + # Only scrape endpoints in the LangSmith namespace + kubernetes_sd_configs: + - role: endpoints + namespaces: + names: [] + relabel_configs: + # Only scrape services with the name langsmith-.* + - source_labels: [__meta_kubernetes_service_name] + regex: "langsmith-.*" + action: keep + # Only scrape ports with the following names + - source_labels: [__meta_kubernetes_endpoint_port_name] + regex: "(backend|platform|playground|redis-metrics|postgres-metrics|ch-metrics)" + action: keep + # Promote useful metadata into regular labels + - source_labels: [__meta_kubernetes_service_name] + target_label: k8s_service + - source_labels: 
[__meta_kubernetes_pod_name] + target_label: k8s_pod + # Replace the default "host:port" as Prom's instance label + - source_labels: [__address__] + target_label: instance ``` +:::info Note +**This configuration requires 'get', 'list', and 'watch' permissions on pods, services and endpoints in the given namespace.** +::: + ### Traces For traces, you need to enable the OTLP receiver. The following configuration can be used to listen to HTTP traces on port 4318, and GRPC on port 4317: @@ -105,115 +138,161 @@ Exporters just need to point to an external endpoint of your liking. The followi ```yaml otlphttp/logs: endpoint: - tls: false otlphttp/metrics: endpoint: - tls: false otlphttp/traces: endpoint: - tls: false ``` :::note Note **The OTel Collector also supports exporting directly to a [Datadog](https://docs.datadoghq.com/opentelemetry/setup/collector_exporter) endpoint.** ::: -# Example Collector Configuration: +# Example Collector Configuration: Logs Sidecar Note that this configuration uses a filter to drop any logs from files other than LangSmith logs in the deployment namespace. 
```yaml -receivers: - filelog: - exclude: [] - include: - - /var/log/pods/*/*/*.log - include_file_name: false - include_file_path: true - operators: - - id: container-parser - max_log_size: 102400 - type: container - retry_on_failure: - enabled: true - start_at: end - - prometheus: - config: - scrape_configs: - - job_name: database-metrics - scrape_interval: 15s - static_configs: - - targets: - - ..svc.cluster.local: - - ..svc.cluster.local: - - ..svc.cluster.local:..svc.cluster.local: - - ..svc.cluster.local: - - ..svc.cluster.local: - metrics_path: /metrics - otlp: - protocols: - grpc: - endpoint: 0.0.0.0:4317 - http: - endpoint: 0.0.0.0:4318 - -processors: - batch: - send_batch_size: 8192 - timeout: 1s - memory_limiter: - check_interval: 1m - limit_percentage: 75 - spike_limit_percentage: 25 - filter: - error_mode: ignore - logs: - log_record: - - 'resource.attributes["k8s.namespace.name"] != ""' - - 'resource.attributes["k8s.app.name"] != ""' - k8sattributes: - extract: - labels: - - from: pod - key: app.kubernetes.io/name - tag_name: k8s.app.name - metadata: - - k8s.namespace.name - -exporters: - otlphttp/logs: - endpoint: - tls: - insecure: false - otlphttp/metrics: - endpoint: - tls: - insecure: false - otlphttp/traces: - endpoint: - tls: - insecure: false - -service: - pipelines: - logs/langsmith: - receivers: [filelog] - processors: [k8sattributes, filter, batch, memory_limiter] - exporters: [otlphttp/logs] - metrics/langsmith: - receivers: [prometheus] - processors: [batch, memory_limiter] - exporters: [otlphttp/metrics] - traces/langsmith: - receivers: [otlp] - processors: [batch, memory_limiter] - exporters: [otlphttp/traces] +apiVersion: opentelemetry.io/v1beta1 +kind: OpenTelemetryCollector +metadata: + name: otel-collector-sidecar +spec: + mode: sidecar + image: otel/opentelemetry-collector-contrib:0.123.0 + + config: + receivers: + filelog: + exclude: + - "**/otc-container/*.log" + - "**/postgres-metrics-exporter/*.log" + - 
"**/redis-metrics-exporter/*.log" + include: + - /var/log/pods/${POD_NAMESPACE}_${POD_NAME}_${POD_UID}/*/*.log + include_file_name: false + include_file_path: true + operators: + - id: container-parser + type: container + retry_on_failure: + enabled: true + start_at: end + + processors: + batch: + send_batch_size: 8192 + timeout: 10s + memory_limiter: + check_interval: 1m + limit_percentage: 90 + spike_limit_percentage: 80 + + exporters: + otlphttp/logs: + endpoint: + + service: + pipelines: + logs/langsmith: + receivers: [filelog] + processors: [batch, memory_limiter] + exporters: [otlphttp/logs] + + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: POD_UID + valueFrom: + fieldRef: + fieldPath: metadata.uid + volumes: + - name: varlogpods + hostPath: + path: /var/log/pods + volumeMounts: + - name: varlogpods + mountPath: /var/log/pods + readOnly: true +``` + +# Example Collector Configuration: Metrics and Traces Gateway + +```yaml +apiVersion: opentelemetry.io/v1beta1 +kind: OpenTelemetryCollector +metadata: + name: otel-collector-gateway +spec: + mode: deployment + image: otel/opentelemetry-collector-contrib:0.123.0 + + config: + receivers: + prometheus: + config: + scrape_configs: + - job_name: langsmith-services + metrics_path: /metrics + scrape_interval: 15s + # Only scrape endpoints in the LangSmith namespace + kubernetes_sd_configs: + - role: endpoints + namespaces: + names: [] + relabel_configs: + # Only scrape services with the name langsmith-.* + - source_labels: [__meta_kubernetes_service_name] + regex: "langsmith-.*" + action: keep + # Only scrape ports with the following names + - source_labels: [__meta_kubernetes_endpoint_port_name] + regex: "(backend|platform|playground|redis-metrics|postgres-metrics|ch-metrics)" + action: keep + # Promote useful metadata into regular labels + - source_labels: [__meta_kubernetes_service_name] + 
target_label: k8s_service + - source_labels: [__meta_kubernetes_pod_name] + target_label: k8s_pod + # Replace the default "host:port" as Prom's instance label + - source_labels: [__address__] + target_label: instance + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + + processors: + batch: + send_batch_size: 8192 + timeout: 10s + memory_limiter: + check_interval: 1m + limit_percentage: 90 + spike_limit_percentage: 80 + + exporters: + otlphttp/metrics: + endpoint: + otlphttp/traces: + endpoint: + + service: + pipelines: + metrics/langsmith: + receivers: [prometheus] + processors: [batch, memory_limiter] + exporters: [otlphttp/metrics] + traces/langsmith: + receivers: [otlp] + processors: [batch, memory_limiter] + exporters: [otlphttp/traces] ``` From 53eadaf5866a261dbdda5e29303f18fa1cbcdbbc Mon Sep 17 00:00:00 2001 From: romain-priour-lc Date: Fri, 20 Jun 2025 08:43:49 -0700 Subject: [PATCH 7/7] review comments --- .../observability/export_backend.mdx | 3 +- .../observability/langsmith_collector.mdx | 262 +++++++++--------- 2 files changed, 128 insertions(+), 137 deletions(-) diff --git a/docs/self_hosting/observability/export_backend.mdx b/docs/self_hosting/observability/export_backend.mdx index bf58c965e..f602623e9 100644 --- a/docs/self_hosting/observability/export_backend.mdx +++ b/docs/self_hosting/observability/export_backend.mdx @@ -21,7 +21,7 @@ Infrastructure refers to: # Logs: [OTel Example](./langsmith_collector#logs) -All services that are part of the LangSmith self-hosted deployment write their logs to their node/container filesystem. This includes Postgres, Redis and Clickhouse if you are running the default in-cluter versions. +All services that are part of the LangSmith self-hosted deployment write their logs to their node/container filesystem. This includes Postgres, Redis and Clickhouse if you are running the in-cluster versions. 
In order to access these logs, you need to set up your collector to read from those files. Most popular collectors support reading logs from container filsystems. Example file system integrations: @@ -36,6 +36,7 @@ Example file system integrations: The following LangSmith services expose metrics at an endpoint, in the Prometheus metrics format. - Backend: `http://..svc.cluster.local:1984/metrics` - Platform Backend: `http://..svc.cluster.local:1986/metrics` + - Host Backend: `http://..svc.cluster.local:1985/metrics` - Playground: `http://..svc.cluster.local:1988/metrics` It is recommended to use a [Prometheus](https://prometheus.io/docs/prometheus/latest/getting_started/#configure-prometheus-to-monitor-the-sample-targets) or diff --git a/docs/self_hosting/observability/langsmith_collector.mdx b/docs/self_hosting/observability/langsmith_collector.mdx index c105060b3..a1b097816 100644 --- a/docs/self_hosting/observability/langsmith_collector.mdx +++ b/docs/self_hosting/observability/langsmith_collector.mdx @@ -153,146 +153,136 @@ otlphttp/traces: Note that this configuration uses a filter to drop any logs from files other than LangSmith logs in the deployment namespace. 
```yaml -apiVersion: opentelemetry.io/v1beta1 -kind: OpenTelemetryCollector -metadata: - name: otel-collector-sidecar -spec: - mode: sidecar - image: otel/opentelemetry-collector-contrib:0.123.0 +mode: sidecar +image: otel/opentelemetry-collector-contrib:0.123.0 + +config: + receivers: + filelog: + exclude: + - "**/otc-container/*.log" + - "**/postgres-metrics-exporter/*.log" + - "**/redis-metrics-exporter/*.log" + include: + - /var/log/pods/${POD_NAMESPACE}_${POD_NAME}_${POD_UID}/*/*.log + include_file_name: false + include_file_path: true + operators: + - id: container-parser + type: container + retry_on_failure: + enabled: true + start_at: end + + processors: + batch: + send_batch_size: 8192 + timeout: 10s + memory_limiter: + check_interval: 1m + limit_percentage: 90 + spike_limit_percentage: 80 + + exporters: + otlphttp/logs: + endpoint: + + service: + pipelines: + logs/langsmith: + receivers: [filelog] + processors: [batch, memory_limiter] + exporters: [otlphttp/logs] - config: - receivers: - filelog: - exclude: - - "**/otc-container/*.log" - - "**/postgres-metrics-exporter/*.log" - - "**/redis-metrics-exporter/*.log" - include: - - /var/log/pods/${POD_NAMESPACE}_${POD_NAME}_${POD_UID}/*/*.log - include_file_name: false - include_file_path: true - operators: - - id: container-parser - type: container - retry_on_failure: - enabled: true - start_at: end - - processors: - batch: - send_batch_size: 8192 - timeout: 10s - memory_limiter: - check_interval: 1m - limit_percentage: 90 - spike_limit_percentage: 80 - - exporters: - otlphttp/logs: - endpoint: - - service: - pipelines: - logs/langsmith: - receivers: [filelog] - processors: [batch, memory_limiter] - exporters: [otlphttp/logs] - - env: - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: POD_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - - name: POD_UID - valueFrom: - fieldRef: - fieldPath: metadata.uid - volumes: - - name: varlogpods - hostPath: - path: 
/var/log/pods - volumeMounts: - - name: varlogpods - mountPath: /var/log/pods - readOnly: true +env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: POD_UID + valueFrom: + fieldRef: + fieldPath: metadata.uid +volumes: + - name: varlogpods + hostPath: + path: /var/log/pods +volumeMounts: + - name: varlogpods + mountPath: /var/log/pods + readOnly: true ``` # Example Collector Configuration: Metrics and Traces Gateway ```yaml -apiVersion: opentelemetry.io/v1beta1 -kind: OpenTelemetryCollector -metadata: - name: otel-collector-gateway -spec: - mode: deployment - image: otel/opentelemetry-collector-contrib:0.123.0 - - config: - receivers: - prometheus: - config: - scrape_configs: - - job_name: langsmith-services - metrics_path: /metrics - scrape_interval: 15s - # Only scrape endpoints in the LangSmith namespace - kubernetes_sd_configs: - - role: endpoints - namespaces: - names: [] - relabel_configs: - # Only scrape services with the name langsmith-.* - - source_labels: [__meta_kubernetes_service_name] - regex: "langsmith-.*" - action: keep - # Only scrape ports with the following names - - source_labels: [__meta_kubernetes_endpoint_port_name] - regex: "(backend|platform|playground|redis-metrics|postgres-metrics|ch-metrics)" - action: keep - # Promote useful metadata into regular labels - - source_labels: [__meta_kubernetes_service_name] - target_label: k8s_service - - source_labels: [__meta_kubernetes_pod_name] - target_label: k8s_pod - # Replace the default "host:port" as Prom's instance label - - source_labels: [__address__] - target_label: instance - otlp: - protocols: - grpc: - endpoint: 0.0.0.0:4317 - http: - endpoint: 0.0.0.0:4318 - - processors: - batch: - send_batch_size: 8192 - timeout: 10s - memory_limiter: - check_interval: 1m - limit_percentage: 90 - spike_limit_percentage: 80 - - exporters: - otlphttp/metrics: - endpoint: - otlphttp/traces: - 
endpoint: - - service: - pipelines: - metrics/langsmith: - receivers: [prometheus] - processors: [batch, memory_limiter] - exporters: [otlphttp/metrics] - traces/langsmith: - receivers: [otlp] - processors: [batch, memory_limiter] - exporters: [otlphttp/traces] +mode: deployment +image: otel/opentelemetry-collector-contrib + +config: + receivers: + prometheus: + config: + scrape_configs: + - job_name: langsmith-services + metrics_path: /metrics + scrape_interval: 15s + # Only scrape endpoints in the LangSmith namespace + kubernetes_sd_configs: + - role: endpoints + namespaces: + names: [] + relabel_configs: + # Only scrape services with the name langsmith-.* + - source_labels: [__meta_kubernetes_service_name] + regex: "langsmith-.*" + action: keep + # Only scrape ports with the following names + - source_labels: [__meta_kubernetes_endpoint_port_name] + regex: "(backend|platform|playground|redis-metrics|postgres-metrics|ch-metrics)" + action: keep + # Promote useful metadata into regular labels + - source_labels: [__meta_kubernetes_service_name] + target_label: k8s_service + - source_labels: [__meta_kubernetes_pod_name] + target_label: k8s_pod + # Replace the default "host:port" as Prom's instance label + - source_labels: [__address__] + target_label: instance + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + + processors: + batch: + send_batch_size: 8192 + timeout: 10s + memory_limiter: + check_interval: 1m + limit_percentage: 90 + spike_limit_percentage: 80 + + exporters: + otlphttp/metrics: + endpoint: + otlphttp/traces: + endpoint: + + service: + pipelines: + metrics/langsmith: + receivers: [prometheus] + processors: [batch, memory_limiter] + exporters: [otlphttp/metrics] + traces/langsmith: + receivers: [otlp] + processors: [batch, memory_limiter] + exporters: [otlphttp/traces] ```