Merged · Changes from 5 commits
6 changes: 4 additions & 2 deletions .agents/cursor/rules/available-internal-links.mdc
@@ -67,13 +67,15 @@ Pages: Self Hosting, Administration, Automated Access Provisioning, Headless Ini
 - [Core Concepts](/docs/evaluation/core-concepts.md)
 - [Annotation Queues](/docs/evaluation/evaluation-methods/annotation-queues.md)
 - [Llm As A Judge](/docs/evaluation/evaluation-methods/llm-as-a-judge.md)
-- [Score Analytics](/docs/evaluation/evaluation-methods/score-analytics.md)
 - [Scores Via Sdk](/docs/evaluation/evaluation-methods/scores-via-sdk.md)
 - [Scores Via Ui](/docs/evaluation/evaluation-methods/scores-via-ui.md)
-- [Data Model](/docs/evaluation/experiments/data-model.md)
+- [Scores Overview](/docs/evaluation/scores/overview.md)
+- [Score Analytics](/docs/evaluation/scores/score-analytics.md)
+- [Scores Data Model](/docs/evaluation/scores/data-model.md)
 - [Datasets](/docs/evaluation/experiments/datasets.md)
 - [Experiments Via Sdk](/docs/evaluation/experiments/experiments-via-sdk.md)
 - [Experiments Via Ui](/docs/evaluation/experiments/experiments-via-ui.md)
+- [Experiments Data Model](/docs/evaluation/experiments/data-model.md)
 - [Overview](/docs/evaluation/overview.md)
 - [Troubleshooting And Faq](/docs/evaluation/troubleshooting-and-faq.md)
 - [Glossary](/docs/glossary.md)
4 changes: 2 additions & 2 deletions components/Glossary.tsx
@@ -384,15 +384,15 @@ const glossaryTerms: GlossaryTerm[] = [
     term: "Score",
     id: "score",
     definition: "The output of an evaluation. Scores can be numeric, categorical, or boolean and are assigned to traces, observations, sessions, or dataset runs.",
-    link: "/docs/evaluation/experiments/data-model#scores",
+    link: "/docs/evaluation/scores/data-model#scores",
     categories: ["EVALUATION"],
     relatedTerms: ["Score Config", "Evaluator", "LLM-as-a-Judge", "Annotation Queue"],
   },
   {
     term: "Score Config",
     id: "score-config",
     definition: "A configuration defining how a score is calculated and interpreted. Includes data type, value constraints, and categories for standardized scoring.",
-    link: "/docs/evaluation/experiments/data-model#score-config",
+    link: "/docs/evaluation/scores/data-model#score-config",
     categories: ["EVALUATION"],
     relatedTerms: ["Score", "LLM-as-a-Judge"],
   },
2 changes: 1 addition & 1 deletion content/blog/2025-09-05-automated-evaluations.mdx
@@ -62,7 +62,7 @@ For this guide, we will set up an evaluator for the "Out of Scope" failure mode.

 ## How to Measure [#how-to-measure]

-In Langfuse, all evaluations are tracked as **Scores**, which [can be attached to traces, observations, sessions or dataset runs](/docs/evaluation/experiments/data-model#scores). Evaluations in Langfuse can be set up in two main ways:
+In Langfuse, all evaluations are tracked as **Scores**, which [can be attached to traces, observations, sessions or dataset runs](/docs/evaluation/scores/overview). Evaluations in Langfuse can be set up in two main ways:

 **In the Langfuse UI:** In Langfuse, you can set up **LLM-as-a-Judge Evaluators** that use another LLM to evaluate your application's output on subjective and nuanced criteria. These are easily configured directly in Langfuse. For a guide on setting them up in the UI, check the documentation on **[LLM-as-a-Judge evaluators](/docs/evaluation/evaluation-methods/llm-as-a-judge)**.
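The blog post above notes that scores attach to traces, observations, sessions or dataset runs. For the API/SDK route, the sketch below shows one hedged way to assemble such a score payload; the field names are assumptions modeled loosely on the public scores endpoint, not a verified schema:

```python
def make_score_payload(name, value, trace_id=None, session_id=None):
    """Build a score payload attached to exactly one target object.

    Field names here are illustrative; check the Langfuse API reference
    for the exact schema of POST /api/public/scores before relying on this.
    """
    targets = {"traceId": trace_id, "sessionId": session_id}
    chosen = {k: v for k, v in targets.items() if v is not None}
    if len(chosen) != 1:
        raise ValueError("attach the score to exactly one of trace or session")
    return {"name": name, "value": value, **chosen}

payload = make_score_payload("helpfulness", 0.9, trace_id="trace-123")
print(payload)  # {'name': 'helpfulness', 'value': 0.9, 'traceId': 'trace-123'}
```

The real API supports more targets (observations, dataset runs) and extra fields such as data type and comment; this only illustrates the attach-to-one-object idea.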
2 changes: 1 addition & 1 deletion content/changelog/2024-08-19-score-analytics.mdx
@@ -4,7 +4,7 @@ title: Advanced Score Analytics Charts
 description: Explore customizable charts for aggregate and time series analytics, grouped by score type, source and name.
 author: Marlies
 ogVideo: https://static.langfuse.com/docs-videos/score_analytics.mp4
-canonical: /docs/evaluation/evaluation-methods/score-analytics
+canonical: /docs/evaluation/scores/score-analytics
 ---

 import { ChangelogHeader } from "@/components/changelog/ChangelogHeader";
2 changes: 1 addition & 1 deletion content/changelog/2025-04-28-session-level-scores.mdx
@@ -4,7 +4,7 @@ description: Create and manage scores at the session level for more comprehensiv
 date: 2025-04-28
 author: Marlies
 ogVideo: https://static.langfuse.com/docs-videos/session_scores.mp4
-canonical: /docs/evaluation/experiments/data-model#scores
+canonical: /docs/evaluation/scores/data-model#scores
 ---

 import { ChangelogHeader } from "@/components/changelog/ChangelogHeader";
2 changes: 1 addition & 1 deletion content/changelog/2025-05-07-run-level-scores.mdx
@@ -3,7 +3,7 @@ title: Dataset Run Level Scores
 description: Score dataset runs to assess the overall quality of each run
 date: 2025-05-07
 author: Marlies
-canonical: /docs/evaluation/experiments/data-model#scores
+canonical: /docs/evaluation/scores/data-model#scores
 ---

 import { ChangelogHeader } from "@/components/changelog/ChangelogHeader";
@@ -5,7 +5,7 @@ badge: Launch Week 4 🚀
 description: Validate evaluation reliability and uncover insights with comprehensive score analysis. Compare different evaluation methods, track trends over time, and measure agreement between human annotators and LLM judges.
 author: Michael
 ogImage: /images/changelog/score-analytics-compare-numeric.png
-canonical: /docs/evaluation/evaluation-methods/score-analytics
+canonical: /docs/evaluation/scores/score-analytics
 ---

 import { ChangelogHeader } from "@/components/changelog/ChangelogHeader";
@@ -82,7 +82,7 @@ Score Analytics provides a lightweight, zero-configuration way to analyze your s
 import { Book, Calendar } from "lucide-react";

 <Cards num={1}>
-  <Card title="Score Analytics Documentation" href="/docs/evaluation/evaluation-methods/score-analytics" icon={<Book />} />
+  <Card title="Score Analytics Documentation" href="/docs/evaluation/scores/score-analytics" icon={<Book />} />
   <Card title="Score Configuration Management" href="/faq/all/manage-score-configs" icon={<Book />} />
   <Card title="See all Launch Week releases" href="/blog/2025-10-29-launch-week-4" icon={<Calendar />} />
 </Cards>
2 changes: 1 addition & 1 deletion content/docs/administration/billable-units.mdx
@@ -5,7 +5,7 @@ description: Learn how billable units are calculated in Langfuse.

 # Billable Units

-Langfuse Cloud [pricing](/pricing) is based on the number of ingested units per billing period. Units are either [traces](/docs/observability/data-model#traces), [observations](/docs/observability/data-model#observations) or [scores](/docs/evaluation/experiments/data-model#scores).
+Langfuse Cloud [pricing](/pricing) is based on the number of ingested units per billing period. Units are either [traces](/docs/observability/data-model#traces), [observations](/docs/observability/data-model#observations) or [scores](/docs/evaluation/scores/data-model#scores).

 `Units` = `Count of Traces` + `Count of Observations` + `Count of Scores`
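The unit formula in the billable-units page above can be sketched as a small helper:

```python
def billable_units(traces: int, observations: int, scores: int) -> int:
    """Billable units per the documented formula:
    Units = Count of Traces + Count of Observations + Count of Scores."""
    return traces + observations + scores

# Example billing period: 10k traces, 25k observations, 5k scores
units = billable_units(10_000, 25_000, 5_000)
print(units)  # 40000
```

Note that every score ingested counts as its own unit, so heavy automated evaluation adds to the bill alongside the traces it scores.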
16 changes: 11 additions & 5 deletions content/docs/evaluation/core-concepts.mdx
@@ -17,7 +17,7 @@ Ready to start?

 LLM applications often have a constant loop of testing and monitoring.

-**Offline evaluation** lets you test your application against a fixed dataset before you deploy. You run your new prompt or model against test cases, review the scores, iterate until the results look good, then deploy your changes. In Langfuse, you can do that by running [Experiments](/docs/evaluation/core-concepts#experiments).
+**Offline evaluation** lets you test your application against a fixed dataset before you deploy. You run your new prompt or model against test cases, review the [scores](#scores), iterate until the results look good, then deploy your changes. In Langfuse, you can do that by running [Experiments](/docs/evaluation/core-concepts#experiments).

 **Online evaluation** scores live traces to catch issues in real traffic. When you find edge cases your dataset didn't cover, you add them back to your dataset so future experiments will catch them.

@@ -38,9 +38,15 @@ LLM applications often have a constant loop of testing and monitoring.
 >
 > Over time, your dataset grows from a couple of examples to a diverse, representative set of real-world test cases.

+## Scores [#scores]
+
+[Scores](/docs/evaluation/scores/overview) are Langfuse's universal data object for storing evaluation results. Any time you want to assign a quality judgment to an LLM output, whether by a human annotation, an LLM judge, a programmatic check, or end-user feedback, the result is stored as a score.
+
+Scores can be attached to traces, observations, sessions, or dataset runs. Every score has a **name**, a **value**, and a **data type** (`NUMERIC`, `CATEGORICAL`, `BOOLEAN`, or `TEXT`). Learn more about [score types](/docs/evaluation/scores/overview#score-types), [how to create scores](/docs/evaluation/scores/overview#how-to-create-scores), and [score analytics](/docs/evaluation/scores/score-analytics) on the dedicated [Scores](/docs/evaluation/scores/overview) page.
+
 ## Evaluation Methods [#evaluation-methods]

-Evaluation methods are the functions that score traces, observations, sessions, or dataset runs. You can use a variety of evaluation methods to add [scores](/docs/evaluation/experiments/data-model#scores).
+Evaluation methods are the functions that score traces, observations, sessions, or dataset runs. You can use a variety of evaluation methods to add [scores](#scores).

 | Method | What | Use when |
@@ -50,7 +56,7 @@ Evaluation methods are the functions that score traces, observations, sessions,
 | [Annotation Queues](/docs/evaluation/evaluation-methods/annotation-queues) | Structured human review workflows with customizable queues | Building ground truth, systematic labeling, team collaboration |
 | [Scores via API/SDK](/docs/evaluation/evaluation-methods/scores-via-sdk) | Programmatically add scores using the Langfuse API or SDK | Custom evaluation pipelines, deterministic checks, automated workflows |

-When setting up new evaluation methods, you can use [Score Analytics](/docs/evaluation/evaluation-methods/score-analytics) to analyze or sense-check the scores you produce.
+When setting up new evaluation methods, you can use [Score Analytics](/docs/evaluation/scores/score-analytics) to analyze or sense-check the scores you produce.
 ## Experiments [#experiments]

 An experiment runs your application against a dataset and evaluates the outputs. This is how you test changes before deploying to production.
@@ -65,7 +71,7 @@ Before diving into experiments, it's helpful to understand the building blocks i
 | **Dataset item** | One item in a dataset. Each dataset item contains an input (the scenario to test) and optionally an expected output. |
 | **Task** | The application code that you want to test in an experiment. This will be performed on each dataset item, and you will score the output.
 | **Evaluation Method** | A function that scores experiment results. In the context of a Langfuse experiment, this can be a [deterministic check](/docs/evaluation/evaluation-methods/custom-scores), or [LLM-as-a-Judge](/docs/evaluation/evaluation-methods/llm-as-a-judge). |
-| **Score** | The output of an evaluation. This can be numeric, categorical, or boolean. See [Scores](/docs/evaluation/experiments/data-model#scores) for more details.|
+| **Score** | The output of an evaluation. See [Scores](#scores) for the available data types and details.|
 | **Experiment Run** | A single execution of your task against all items in a dataset, producing outputs (and scores). |

 You can find the data model for these objects [here](/docs/evaluation/experiments/data-model).
@@ -85,7 +91,7 @@ Often, you want to score these experiment results. You can use various [evaluati

 You can compare experiment runs to see if a new prompt version improves scores, or identify specific inputs where your application struggles. Based on these experiment results, you can decide whether the change is ready to be deployed to production.

-You can find more details on how these objects link together under the hood on the [data model page](/docs/evaluation/experiments/data-model).
+You can find more details on how these objects link together under the hood on the [data model page](/docs/evaluation/experiments/data-model).

 ### Two ways to run experiments
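The building blocks listed in the core-concepts table (dataset item, task, evaluation method, score, experiment run) compose in a straightforward loop. A minimal sketch of that loop; all names here are invented for illustration and are not the Langfuse SDK's actual experiment API:

```python
def run_experiment(dataset, task, evaluators):
    """Run `task` on every dataset item and score each output.

    dataset: list of {"input": ..., "expected_output": ...} dicts
    evaluators: dict mapping a score name to fn(output, expected) -> value
    """
    results = []
    for item in dataset:
        output = task(item["input"])  # the application code under test
        scores = {
            name: evaluate(output, item.get("expected_output"))
            for name, evaluate in evaluators.items()
        }
        results.append({"input": item["input"], "output": output, "scores": scores})
    return results

# Toy task standing in for an LLM app, with a deterministic exact-match check
dataset = [{"input": "2+2", "expected_output": "4"}]
task = lambda q: str(eval(q))
evaluators = {"exact_match": lambda out, exp: out == exp}
print(run_experiment(dataset, task, evaluators)[0]["scores"])  # {'exact_match': True}
```

A real experiment run additionally links each output and score back to a trace, which is what makes run-over-run comparison possible in the UI.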
@@ -5,7 +5,7 @@ description: Manage your annotation tasks with ease using our new workflow tooli

 # Annotation Queues [#annotation-queues]

+Annotation Queues are a manual [evaluation method](/docs/evaluation/core-concepts#evaluation-methods) that is built for domain experts to add [scores](/docs/evaluation/scores/overview) and comments to traces, observations or sessions.

 <Video
   src="https://static.langfuse.com/docs-videos/2025-12-19-annotation-queues.mp4"
@@ -27,7 +27,7 @@ Annotation Queues are a manual [evaluation method](/docs/evaluation/core-concept
 ### Create a new Annotation Queue

 - Click on `New Queue` to create a new queue.
+- Select the [`Score Configs`](/docs/evaluation/scores/data-model#score-config) you want to use for this queue.
 - Set the `Queue name` and `Description` (optional).
 - Assign users to the queue (optional).
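The queue setup above selects Score Configs, which standardize what annotators can submit: a data type plus value constraints or categories. A hypothetical validation sketch of that idea, not Langfuse's actual implementation or parameter names:

```python
def validate_score(value, data_type, categories=None, min_value=None, max_value=None):
    """Check a score value against a score-config-like set of constraints.

    Parameter names are illustrative, not the Langfuse API's.
    """
    if data_type == "NUMERIC":
        # numeric scores may be bounded by an optional min/max range
        if not isinstance(value, (int, float)):
            return False
        if min_value is not None and value < min_value:
            return False
        if max_value is not None and value > max_value:
            return False
        return True
    if data_type == "CATEGORICAL":
        # categorical scores must pick from the configured categories
        return categories is not None and value in categories
    if data_type == "BOOLEAN":
        return isinstance(value, bool)
    return isinstance(value, str)  # TEXT

print(validate_score(0.5, "NUMERIC", min_value=0, max_value=1))   # True
print(validate_score("spam", "CATEGORICAL", categories=["ok", "spam"]))  # True
```

Constraining annotator input like this is what keeps human-provided scores comparable across reviewers and over time.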
@@ -18,7 +18,7 @@ This approach has become one of the most popular methods for evaluating LLM appl

 ## How LLM-as-a-Judge Works

+The core idea is straightforward: present an LLM with the input, the application's output, and a scoring rubric, then ask it to evaluate the output. The judge model produces a [`score`](/docs/evaluation/scores/overview) along with reasoning explaining its assessment.

 A typical LLM-as-a-Judge prompt includes:
 1. **Evaluation criteria** — a rubric defining what "good" looks like (e.g., "Score 1 if the answer is factually incorrect, 5 if fully accurate and well-sourced")
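The judge prompt described above (criteria, input, output) can be assembled and its reply parsed programmatically. A minimal sketch; the prompt wording and the `SCORE:` reply format are assumptions for illustration, not what Langfuse's managed evaluators actually emit:

```python
def build_judge_prompt(criteria: str, user_input: str, app_output: str) -> str:
    """Assemble an LLM-as-a-Judge prompt from a rubric and the trace data."""
    return (
        f"Evaluation criteria:\n{criteria}\n\n"
        f"Input:\n{user_input}\n\n"
        f"Output to evaluate:\n{app_output}\n\n"
        "Reply as 'SCORE: <1-5>' on the first line, then your reasoning."
    )

def parse_judge_reply(reply: str):
    """Split the judge's reply into a numeric score and its reasoning text."""
    first, _, rest = reply.partition("\n")
    score = int(first.replace("SCORE:", "").strip())
    return score, rest.strip()

prompt = build_judge_prompt(
    "Score 1 if the answer is factually incorrect, 5 if fully accurate and well-sourced.",
    "What year did the Berlin Wall fall?",
    "The Berlin Wall fell in 1989.",
)
score, reasoning = parse_judge_reply("SCORE: 5\nThe answer is accurate.")
print(score)  # 5
```

In practice the prompt string would be sent to a judge model and the parsed score stored as a Langfuse score; a structured-output mode is usually more robust than string parsing.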
3 changes: 1 addition & 2 deletions content/docs/evaluation/evaluation-methods/meta.json
@@ -4,7 +4,6 @@
   "llm-as-a-judge",
   "annotation-queues",
   "scores-via-ui",
-  "scores-via-sdk",
-  "score-analytics"
+  "scores-via-sdk"
 ]
 }