diff --git a/helm-frontend/src/components/Landing/CallCenterLanding.tsx b/helm-frontend/src/components/Landing/CallCenterLanding.tsx
deleted file mode 100644
index f00d5c8116..0000000000
--- a/helm-frontend/src/components/Landing/CallCenterLanding.tsx
+++ /dev/null
@@ -1,58 +0,0 @@
-import MiniLeaderboard from "@/components/MiniLeaderboard";
-import { Link } from "react-router-dom";
-
-export default function CallCenterLanding() {
-  return (
-    <div className="container mx-auto px-16">
-      <h1 className="text-3xl my-8 font-bold text-center">
-        HELM Call Center Leaderboard
-      </h1>
-      <div className="flex flex-col lg:flex-row gap-8">
-        <div className="flex-1 text-l">
-          <p className="my-4">
-            LLMs show great potential for applications for the call center, yet
-            there is a lack of domain-specific and ecologically-valid
-            evaluations in this domain. To address this, we introduce the{" "}
-            <strong className="font-bold">HELM Call Center leaderboard</strong>.
-            The HELM Call Center leaderboard evaluates leading LLMs on a
-            summarization task over a dataset of real helpdesk call transcripts
-            provided by Accenture. The quality of the summaries is evaluated
-            using LLM-as-judge with an ensemble of 3 models. We hope that this
-            leaderboard provides some initial insights into the potential of
-            LLMs in this domain.
-          </p>
-          <p className="my-4">
-            This leaderboard was produced through research collaboration with{" "}
-            <a
-              className="font-bold underline text-blue-600 hover:text-blue-800 visited:text-purple-600"
-              href="https://www.accenture.com/"
-            >
-              Accenture
-            </a>
-            , and was funded by the{" "}
-            <a
-              className="font-bold underline text-blue-600 hover:text-blue-800 visited:text-purple-600"
-              href="https://hai.stanford.edu/corporate-affiliate-program"
-            >
-              HAI Corporate Affiliate Program
-            </a>
-            .
-          </p>
-        </div>
-        <div
-          className="py-2 pb-6 rounded-3xl bg-gray-100 h-full" // Stretched to full height
-          style={{ maxWidth: "100%" }}
-        >
-          <MiniLeaderboard />
-          <div className="flex justify-end">
-            <Link to="leaderboard">
-              <button className="px-4 mx-3 mt-1 btn bg-white rounded-md">
-                <span>See More</span>
-              </button>
-            </Link>
-          </div>
-        </div>
-      </div>
-    </div>
-  );
-}
diff --git a/helm-frontend/src/components/Landing/CallTranscriptSummarizationLanding.tsx b/helm-frontend/src/components/Landing/CallTranscriptSummarizationLanding.tsx
deleted file mode 100644
index f80eff31ff..0000000000
--- a/helm-frontend/src/components/Landing/CallTranscriptSummarizationLanding.tsx
+++ /dev/null
@@ -1,82 +0,0 @@
-import MiniLeaderboard from "@/components/MiniLeaderboard";
-import { Link } from "react-router-dom";
-
-export default function CallTranscriptSummarizationLanding() {
-  return (
-    <div className="container mx-auto px-16">
-      <h1 className="text-3xl my-8 font-bold text-center">
-        HELM Call Transcript Summarization Leaderboard
-      </h1>
-      <div className="flex flex-col lg:flex-row gap-8">
-        <div className="flex-[3] text-l">
-          <img
-            src="https://storage.googleapis.com/crfm-helm-public/assets/images/call-center-banner-narrow.jpg"
-            className="shadow-xl"
-            width="1200"
-            height="400"
-          />
-          <p className="my-4">
-            Large language models (LLMs) show great potential for call center
-            applications, yet there is a lack of domain-specific and
-            ecologically valid evaluations in this domain. To address this, we
-            introduce the{" "}
-            <strong className="font-bold">
-              HELM Call Transcript Summarization
-            </strong>{" "}
-            leaderboard, which evaluates leading LLMs on a summarization task
-            over a dataset of real call transcripts provided by Accenture.
-          </p>
-          <p className="my-4">
-            This dataset consists of 162 transcribed calls to an internal
-            corporate IT helpdesk. The calls were transcribed using an automatic
-            speech recognition (ASR) model. Transcription errors were
-            deliberately left uncorrected to reflect the nature of real-life
-            transcripts. The transcripts were anonymized using a semi-automated
-            process with human verification.
-          </p>
-          <p className="my-4">
-            To evaluate the LLMs, summaries of the transcripts were generated
-            using 17 LLMs. The quality of the generated summaries were then
-            evaluated using LLM-as-judge with an ensemble of 3 models.
-          </p>
-          <p className="my-4">
-            As with all HELM leaderboards, this leaderboard provides full
-            transparency into all LLM requests and responses, and the results
-            are reproducible using the HELM open source framework. We hope that
-            this leaderboard offers initial insights into the potential of LLMs
-            for this task.
-          </p>
-          <p className="my-4">
-            This leaderboard was produced through research collaboration with{" "}
-            <a
-              className="font-bold underline text-blue-600 hover:text-blue-800 visited:text-purple-600"
-              href="https://www.accenture.com/"
-            >
-              Accenture
-            </a>
-            , and was funded by the{" "}
-            <a
-              className="font-bold underline text-blue-600 hover:text-blue-800 visited:text-purple-600"
-              href="https://hai.stanford.edu/corporate-affiliate-program"
-            >
-              HAI Corporate Affiliate Program
-            </a>
-            .
-          </p>
-        </div>
-        <div
-          className="flex-[2] py-2 rounded-3xl bg-gray-100 h-full" // Stretched to full height
-        >
-          <MiniLeaderboard />
-          <div className="flex justify-end">
-            <Link to="leaderboard">
-              <button className="px-4 mx-3 mt-1 btn bg-white rounded-md">
-                <span>See More</span>
-              </button>
-            </Link>
-          </div>
-        </div>
-      </div>
-    </div>
-  );
-}
diff --git a/helm-frontend/src/routes/Home.tsx b/helm-frontend/src/routes/Home.tsx
index c33a391f64..144ac5258d 100644
--- a/helm-frontend/src/routes/Home.tsx
+++ b/helm-frontend/src/routes/Home.tsx
@@ -6,8 +6,6 @@ import ThaiExamLanding from "@/components/Landing/ThaiExamLanding";
 import FinanceLanding from "@/components/Landing/FinanceLanding";
 import HEIMLanding from "@/components/Landing/HEIMLanding";
 import VHELMLanding from "@/components/VHELMLanding";
-import CallCenterLanding from "@/components/Landing/CallCenterLanding";
-import CallTranscriptSummarizationLanding from "@/components/Landing/CallTranscriptSummarizationLanding";
 import CLEVALanding from "@/components/Landing/CLEVALanding";
 import ToRRLanding from "@/components/Landing/ToRRLanding";
 import HomeLanding from "@/components/Landing/HomeLanding";
@@ -48,10 +46,6 @@ export default function Home() {
     return <ThaiExamLanding />;
   } else if (window.PROJECT_ID === "finance") {
     return <FinanceLanding />;
-  } else if (window.PROJECT_ID === "call-center") {
-    return <CallCenterLanding />;
-  } else if (window.PROJECT_ID === "call-transcript-summarization") {
-    return <CallTranscriptSummarizationLanding />;
   } else if (window.PROJECT_ID === "cleva") {
     return <CLEVALanding />;
   } else if (window.PROJECT_ID === "torr") {
diff --git a/src/helm/benchmark/annotation/call_center_annotator.py b/src/helm/benchmark/annotation/call_center_annotator.py
deleted file mode 100644
index 7176de4ffd..0000000000
--- a/src/helm/benchmark/annotation/call_center_annotator.py
+++ /dev/null
@@ -1,258 +0,0 @@
-import json
-from json.decoder import JSONDecodeError
-import textwrap
-from typing import Any
-
-from helm.benchmark.adaptation.request_state import RequestState
-from helm.benchmark.annotation.annotator import Annotator
-from helm.clients.auto_client import AutoClient
-from helm.common.hierarchical_logger import hlog
-from helm.common.request import Request
-
-
-class CallCenterSummarizationAnnotator(Annotator):
-    """Annotator for call center summarization."""
-
-    name = "call_center_summarization"
-
-    PROMPT_TEMPLATE = """\
-    Score the summary by scoring it on a five-point scale between 1 and 5 using three criteria: Faithfulness, Relevance, and Coherence.
-
-    ### Criteria
-    Faithfulness: Can all the information expressed by the summary can be inferred from the source? (1 = not at all, 5 = very much)
-    Relevance: To what extent the summary include only important information from the source? (1 = not at all, 5 = very much)
-    Coherence: Does the summary organize the relevant information into a well-structured summary? (1 = not at all, 5 = very much)
-
-    ### Call Transcript
-    {{CALL_TRANSCRIPT}}
-
-    ### Summary
-    {{SUMMARY}}
-
-    ### Task
-    Respond with only a raw JSON object in the following format, without using Markdown formatting:
-
-    {"faithfulness": <score>, "relevance": <score>, "coherence": <score>}
-    """  # noqa: E501
-
-    CRITERIA = [
-        "faithfulness",
-        "relevance",
-        "coherence",
-    ]
-
-    def __init__(self, auto_client: AutoClient):
-        super().__init__()
-        self._auto_client = auto_client
-
-    def annotate(self, request_state: RequestState) -> Any:
-        assert request_state.result
-        assert len(request_state.result.completions) == 1
-        call_transcript = request_state.instance.input.text
-        summary = request_state.result.completions[0].text.strip()
-        if not summary.strip():
-            hlog("Returning 0 scores due to empty response")
-            return {"faithfulness": 0, "relevance": 0, "coherence": 0}
-        annotator_prompt = (
-            textwrap.dedent(CallCenterSummarizationAnnotator.PROMPT_TEMPLATE)
-            .replace("{{CALL_TRANSCRIPT}}", call_transcript)
-            .replace("{{SUMMARY}}", summary)
-        )
-        annotator_request = Request(
-            model="openai/gpt-4o-mini-2024-07-18",
-            model_deployment="openai/gpt-4o-mini-2024-07-18",
-            prompt=annotator_prompt,
-            temperature=0.0,
-            max_tokens=256,
-        )
-        annotator_response = self._auto_client.make_request(annotator_request)
-        if not annotator_response.success:
-            raise Exception(f"Annotation request failed: {annotator_response.error}")
-        assert len(annotator_response.completions) == 1
-        annotator_response_text = annotator_response.completions[0].text
-        # OpenAI models like to surround JSON objects with ```json and ``` Markdown formatting.
-        # This strips everything outside the outermost {} brackets.
-        json_start_index = annotator_response_text.find("{")
-        json_end_index = annotator_response_text.rfind("}")
-        if json_start_index < 0 or json_end_index < 0:
-            raise Exception(f"Malformed annotator response: {annotator_response_text}")
-        annotator_response_json = annotator_response_text[json_start_index : json_end_index + 1]
-        try:
-            annotator_response_parsed = json.loads(annotator_response_json)
-        except JSONDecodeError:
-            raise Exception(f"Malformed annotator response: {annotator_response_text}")
-        for expected_key in CallCenterSummarizationAnnotator.CRITERIA:
-            if expected_key not in annotator_response_parsed:
-                raise Exception(f"Malformed annotator response: {annotator_response_text}")
-        return annotator_response_parsed
-
-
-class CallCenterSummarizationPairwiseComparisonAnnotator(Annotator):
-    """Annotator for call center summarization with pairwise comparison."""
-
-    name = "call_center_summarization_pairwise_comparison"
-
-    PROMPT_TEMPLATE = """\
-    Given a call transcript and two different summaries of the call transcript, select your preferred summary, which can be subjective, considering the criteria below. Also provide a one-sentence reasoning for your selection.
-
-    ### Criteria
-    Faithfulness: Can all the information expressed by the summary can be inferred from the source?
-    Relevance: To what extent the summary include only important information from the source?
-    Coherence: Does the summary organize the relevant information into a well-structured summary?
-
-    ### Call Transcript
-    {{CALL_TRANSCRIPT}}
-
-    ### Summary A
-    {{SUMMARY_A}}
-
-    ### Summary B
-    {{SUMMARY_B}}
-
-    ### Task
-    Output only a JSON object with the following format:
-
-    {"reasoning": "Reasoning", "selected": "A" or "B"}
-    """  # noqa: E501
-
-    def __init__(self, auto_client: AutoClient):
-        super().__init__()
-        self._auto_client = auto_client
-
-    def annotate(self, request_state: RequestState) -> Any:
-        assert request_state.result
-        assert len(request_state.result.completions) == 1
-        call_transcript = request_state.instance.input.text
-        summary = request_state.result.completions[0].text.strip()
-        assert len(request_state.instance.all_correct_references) == 1
-        reference_summary = request_state.instance.all_correct_references[0].output.text
-        if not summary.strip():
-            hlog("Returning 0 scores due to empty response")
-            return {"faithfulness": 0, "relevance": 0, "coherence": 0}
-        assert request_state.instance.id is not None
-        instance_id = int(request_state.instance.id[2:])
-        if instance_id % 2:
-            reference_option = "A"
-            summary_a = reference_summary
-            summary_b = summary
-        else:
-            reference_option = "B"
-            summary_a = summary
-            summary_b = reference_summary
-        annotator_prompt = (
-            textwrap.dedent(CallCenterSummarizationPairwiseComparisonAnnotator.PROMPT_TEMPLATE)
-            .replace("{{CALL_TRANSCRIPT}}", call_transcript)
-            .replace("{{SUMMARY_B}}", summary_a)
-            .replace("{{SUMMARY_A}}", summary_b)
-        )
-        annotator_request = Request(
-            model="openai/gpt-4o-2024-08-06",
-            model_deployment="openai/gpt-4o-2024-08-06",
-            prompt=annotator_prompt,
-            temperature=0.0,
-            max_tokens=256,
-        )
-        annotator_response = self._auto_client.make_request(annotator_request)
-        if not annotator_response.success:
-            raise Exception(f"Annotation request failed: {annotator_response.error}")
-        assert len(annotator_response.completions) == 1
-        annotator_response_text = annotator_response.completions[0].text
-        # OpenAI models like to surround JSON objects with ```json and ``` Markdown formatting.
-        # This strips everything outside the outermost {} brackets.
-        json_start_index = annotator_response_text.find("{")
-        json_end_index = annotator_response_text.rfind("}")
-        if json_start_index < 0 or json_end_index < 0:
-            raise Exception(f"Malformed annotator response: {annotator_response_text}")
-        annotator_response_json = annotator_response_text[json_start_index : json_end_index + 1]
-        try:
-            annotator_response_parsed = json.loads(annotator_response_json)
-        except JSONDecodeError:
-            raise Exception(f"Malformed annotator response: {annotator_response_text}")
-        for expected_key in ["reasoning", "selected"]:
-            if expected_key not in annotator_response_parsed:
-                raise Exception(f"Malformed annotator response: {annotator_response_text}")
-        score = 0.0
-        selected = annotator_response_parsed["selected"].strip()
-        if selected != "A" and selected != "B":
-            raise Exception(f"Malformed annotator response: {annotator_response_text}")
-        if selected == reference_option:
-            score = 0.0
-        else:
-            score = 1.0
-        return {
-            "reasoning": annotator_response_parsed["reasoning"],
-            "selected": selected,
-            "reference_option": reference_option,
-            "score": score,
-        }
-
-
-class CallCenterSummarizationKeyPointsRecallAnnotator(Annotator):
-    """Annotator for call center summarization with key point recall."""
-
-    name = "call_center_summarization_key_points_recall"
-
-    PROMPT_TEMPLATE = """\
-    Given a call transcript, a list of key points and a summary, determine which key points are included in the summary.
-
-    ### Call Transcript
-    {{CALL_TRANSCRIPT}}
-
-    ### Key Points
-    {{KEY_POINTS}}
-
-    ### Summary
-    {{SUMMARY}}
-
-    ### Task
-    Output only a JSON array of booleans, where each boolean indicates if the corresponding key point was included in the summary.
-    """  # noqa: E501
-
-    def __init__(self, auto_client: AutoClient):
-        super().__init__()
-        self._auto_client = auto_client
-
-    def annotate(self, request_state: RequestState) -> Any:
-        assert request_state.result
-        assert len(request_state.result.completions) == 1
-        call_transcript = request_state.instance.input.text
-        summary = request_state.result.completions[0].text.strip()
-        key_points = "\n".join(
-            [f"- {reference.output.text}" for reference in request_state.instance.all_correct_references]
-        )
-        if not summary.strip():
-            hlog("Returning 0 scores due to empty response")
-            return {"faithfulness": 0, "relevance": 0, "coherence": 0}
-        annotator_prompt = (
-            textwrap.dedent(CallCenterSummarizationKeyPointsRecallAnnotator.PROMPT_TEMPLATE)
-            .replace("{{CALL_TRANSCRIPT}}", call_transcript)
-            .replace("{{KEY_POINTS}}", key_points)
-            .replace("{{SUMMARY}}", summary)
-        )
-        annotator_request = Request(
-            model="openai/gpt-4o-2024-08-06",
-            model_deployment="openai/gpt-4o-2024-08-06",
-            prompt=annotator_prompt,
-            temperature=0.0,
-            max_tokens=256,
-        )
-        annotator_response = self._auto_client.make_request(annotator_request)
-        if not annotator_response.success:
-            raise Exception(f"Annotation request failed: {annotator_response.error}")
-        assert len(annotator_response.completions) == 1
-        annotator_response_text = annotator_response.completions[0].text
-        # OpenAI models like to surround JSON objects with ```json and ``` Markdown formatting.
-        # This strips everything outside the outermost [] brackets.
-        json_start_index = annotator_response_text.find("[")
-        json_end_index = annotator_response_text.rfind("]")
-        if json_start_index < 0 or json_end_index < 0:
-            raise Exception(f"Malformed annotator response: {annotator_response_text}")
-        annotator_response_json = annotator_response_text[json_start_index : json_end_index + 1]
-        try:
-            annotator_response_parsed = json.loads(annotator_response_json)
-        except JSONDecodeError:
-            raise Exception(f"Malformed annotator response: {annotator_response_text}")
-        if not len(annotator_response_parsed):
-            raise Exception(f"Malformed annotator response: {annotator_response_text}")
-        score = sum([1.0 if elem else 0.0 for elem in annotator_response_parsed]) / len(annotator_response_parsed)
-        return {"key_points_found": json.dumps(annotator_response_parsed), "score": score}
diff --git a/src/helm/benchmark/metrics/helpdesk_call_summarization_metrics.py b/src/helm/benchmark/metrics/helpdesk_call_summarization_metrics.py
deleted file mode 100644
index 21020b15b4..0000000000
--- a/src/helm/benchmark/metrics/helpdesk_call_summarization_metrics.py
+++ /dev/null
@@ -1,48 +0,0 @@
-from typing import Any, Dict, List
-
-from helm.benchmark.adaptation.adapter_spec import AdapterSpec
-from helm.benchmark.adaptation.request_state import RequestState
-from helm.benchmark.metrics.metric import Metric, MetricMetadata
-from helm.benchmark.metrics.metric_name import MetricName
-from helm.benchmark.metrics.metric_service import MetricService
-from helm.benchmark.metrics.statistic import Stat
-
-
-class HelpdeskCallSummarizationMetric(Metric):
-    """Score metrics for helpdesk call summarization."""
-
-    def evaluate_generation(
-        self,
-        adapter_spec: AdapterSpec,
-        request_state: RequestState,
-        metric_service: MetricService,
-        eval_cache_path: str,
-    ) -> List[Stat]:
-        assert request_state.annotations
-        annotations: Dict[str, Any] = request_state.annotations["helpdesk_call_center_summarization"]
-        scores: List[int] = []
-        for annotation_key, annotation_value in annotations.items():
-            if annotation_key.endswith("_score") and annotation_value is not None:
-                scores.append(annotation_value)
-        if not scores:
-            raise ValueError(
-                "Could not compute score in HelpdeskCallSummarizationMetric because all annotators failed."
-            )
-        score = sum(scores) / len(scores)
-        # normalize score
-        score = (score - 1) / 9
-        return [
-            Stat(MetricName("call_summarization_score")).add(score),
-        ]
-
-    def get_metadata(self) -> List[MetricMetadata]:
-        return [
-            MetricMetadata(
-                name="call_summarization_score",
-                display_name="Score",
-                short_display_name="Score",
-                description="Score",
-                lower_is_better=False,
-                group="summarization_metrics",
-            ),
-        ]
diff --git a/src/helm/benchmark/run_specs/call_center_run_specs.py b/src/helm/benchmark/run_specs/call_center_run_specs.py
deleted file mode 100644
index d1eaded8b3..0000000000
--- a/src/helm/benchmark/run_specs/call_center_run_specs.py
+++ /dev/null
@@ -1,201 +0,0 @@
-"""Run specs for experiments only.
-
-These run specs are not intended for use with public leaderboards."""
-
-from helm.benchmark.adaptation.adapter_spec import ADAPT_GENERATION, AdapterSpec
-from helm.benchmark.annotation.annotator import AnnotatorSpec
-from helm.benchmark.metrics.common_metric_specs import get_basic_metric_specs
-from helm.benchmark.metrics.metric import MetricSpec
-from helm.benchmark.run_spec import RunSpec, run_spec_function
-from helm.benchmark.scenarios.scenario import ScenarioSpec
-
-
-@run_spec_function("call_center_summarization")
-def get_call_center_summarization_spec(subset: str = "summarization") -> RunSpec:
-    from helm.benchmark.annotation.call_center_annotator import CallCenterSummarizationAnnotator
-
-    scenario_spec = ScenarioSpec(
-        class_name="helm.benchmark.scenarios.call_center_scenario.CallCenterSummarizationScenario",
-        args={"subset": subset},
-    )
-
-    instructions = "Summarize the call transcript in under 10 sentences."
-
-    adapter_spec = AdapterSpec(
-        method=ADAPT_GENERATION,
-        instructions=instructions,
-        input_prefix="### Call Transcript\n",
-        input_suffix="",
-        output_prefix="",
-        output_suffix="",
-        max_train_instances=0,
-        temperature=0.0,
-        max_tokens=512,
-        num_outputs=1,
-    )
-
-    annotator_specs = annotator_specs = [
-        AnnotatorSpec(class_name="helm.benchmark.annotation.call_center_annotator.CallCenterSummarizationAnnotator")
-    ]
-    annotation_metric_specs = [
-        MetricSpec(
-            class_name="helm.benchmark.metrics.annotation_metrics.AnnotationLikertScaleMetric",
-            args={
-                "annotator_name": CallCenterSummarizationAnnotator.name,
-                "key": criterion,
-                "min_score": 1,
-                "max_score": 5,
-            },
-        )
-        for criterion in CallCenterSummarizationAnnotator.CRITERIA
-    ]
-
-    metric_specs = get_basic_metric_specs([]) + annotation_metric_specs
-
-    group = "call_center_summarization" if subset == "summarization" else f"call_center_summarization_{subset}"
-
-    return RunSpec(
-        name="call_center_summarization",
-        scenario_spec=scenario_spec,
-        adapter_spec=adapter_spec,
-        metric_specs=metric_specs,
-        annotators=annotator_specs,
-        groups=[group],
-    )
-
-
-@run_spec_function("call_center_summarization_pairwise_comparison")
-def get_call_center_summarization_pairwise_comparison_spec() -> RunSpec:
-    scenario_spec = ScenarioSpec(
-        class_name="helm.benchmark.scenarios.call_center_scenario.CallCenterSummarizationPairwiseComparisonScenario",
-    )
-
-    instructions = "Summarize the call transcript in under 10 sentences."
-
-    adapter_spec = AdapterSpec(
-        method=ADAPT_GENERATION,
-        instructions=instructions,
-        input_prefix="### Call Transcript\n",
-        input_suffix="",
-        output_prefix="",
-        output_suffix="",
-        max_train_instances=0,
-        temperature=0.0,
-        max_tokens=512,
-        num_outputs=1,
-    )
-
-    annotator_specs = annotator_specs = [
-        AnnotatorSpec(
-            class_name="helm.benchmark.annotation.call_center_annotator.CallCenterSummarizationPairwiseComparisonAnnotator"  # noqa: E501
-        )
-    ]
-
-    metric_specs = get_basic_metric_specs([]) + [
-        MetricSpec(
-            class_name="helm.benchmark.metrics.annotation_metrics.AnnotationNumericMetric",
-            args={"annotator_name": "call_center_summarization_pairwise_comparison", "key": "score"},
-        )
-    ]
-
-    return RunSpec(
-        name="call_center_summarization_pairwise_comparison",
-        scenario_spec=scenario_spec,
-        adapter_spec=adapter_spec,
-        metric_specs=metric_specs,
-        annotators=annotator_specs,
-        groups=["call_center_summarization_pairwise_comparison"],
-    )
-
-
-@run_spec_function("call_center_summarization_key_points_recall")
-def get_call_center_summarization_key_points_recall_spec() -> RunSpec:
-    scenario_spec = ScenarioSpec(
-        class_name="helm.benchmark.scenarios.call_center_scenario.CallCenterSummarizationKeyPointsRecallScenario",
-    )
-
-    instructions = "Summarize the call transcript in under 10 sentences."
-
-    adapter_spec = AdapterSpec(
-        method=ADAPT_GENERATION,
-        instructions=instructions,
-        input_prefix="### Call Transcript\n",
-        input_suffix="",
-        output_prefix="",
-        output_suffix="",
-        max_train_instances=0,
-        temperature=0.0,
-        max_tokens=512,
-        num_outputs=1,
-    )
-
-    annotator_specs = annotator_specs = [
-        AnnotatorSpec(
-            class_name="helm.benchmark.annotation.call_center_annotator.CallCenterSummarizationKeyPointsRecallAnnotator"
-        )
-    ]
-
-    metric_specs = get_basic_metric_specs([]) + [
-        MetricSpec(
-            class_name="helm.benchmark.metrics.annotation_metrics.AnnotationNumericMetric",
-            args={"annotator_name": "call_center_summarization_key_points_recall", "key": "score"},
-        )
-    ]
-
-    return RunSpec(
-        name="call_center_summarization_key_points_recall",
-        scenario_spec=scenario_spec,
-        adapter_spec=adapter_spec,
-        metric_specs=metric_specs,
-        annotators=annotator_specs,
-        groups=["call_center_summarization_key_points_recall"],
-    )
-
-
-@run_spec_function("helpdesk_call_summarization")
-def get_helpdesk_call_summarization_spec() -> RunSpec:
-    scenario_spec = ScenarioSpec(
-        class_name="helm.benchmark.scenarios.helpdesk_call_summarization_scenario.HelpdeskCallSummarizationScenario",
-    )
-    annotator_specs = annotator_specs = [
-        AnnotatorSpec(
-            class_name="helm.benchmark.annotation.helpdesk_call_summarization_annotator.HelpdeskCallSummarizationAnnotator"  # noqa: E501
-        )
-    ]
-
-    instructions = "The following is a call transcript of a call between a compnay's employee and the company's IT helpdesk. Summarize the call transcript in under 200 words."  # noqa: E501
-
-    adapter_spec = AdapterSpec(
-        method=ADAPT_GENERATION,
-        instructions=instructions,
-        input_prefix="### Call Transcript\n",
-        input_suffix="",
-        output_prefix="",
-        output_suffix="",
-        max_train_instances=0,
-        temperature=0.0,
-        max_tokens=512,
-        num_outputs=1,
-    )
-
-    # annotator_specs = annotator_specs = [
-    #     AnnotatorSpec(class_name="helm.benchmark.annotation.call_center_annotator.CallCenterSummarizationAnnotator")
-    # ]
-    annotation_metric_specs = [
-        MetricSpec(
-            class_name="helm.benchmark.metrics.helpdesk_call_summarization_metrics.HelpdeskCallSummarizationMetric",
-        ),
-    ]
-
-    metric_specs = get_basic_metric_specs([]) + annotation_metric_specs
-
-    group = "helpdesk_call_summarization"
-
-    return RunSpec(
-        name="helpdesk_call_summarization",
-        scenario_spec=scenario_spec,
-        adapter_spec=adapter_spec,
-        metric_specs=metric_specs,
-        annotators=annotator_specs,
-        groups=[group],
-    )
diff --git a/src/helm/benchmark/scenarios/call_center_scenario.py b/src/helm/benchmark/scenarios/call_center_scenario.py
deleted file mode 100644
index 724bf8049f..0000000000
--- a/src/helm/benchmark/scenarios/call_center_scenario.py
+++ /dev/null
@@ -1,84 +0,0 @@
-import datasets
-import os
-from typing import List
-
-from helm.benchmark.scenarios.scenario import (
-    CORRECT_TAG,
-    Output,
-    Reference,
-    Scenario,
-    Instance,
-    TEST_SPLIT,
-    Input,
-)
-from helm.common.general import ensure_directory_exists
-
-
-class CallCenterSummarizationScenario(Scenario):
-    """Call center summarization."""
-
-    name = "call_center_summarization"
-    description = "Call center summarization."
-    tags = ["call_center"]
-
-    def __init__(self, subset: str):
-        super().__init__()
-        self.subset = subset
-
-    def get_instances(self, output_path: str) -> List[Instance]:
-        cache_dir = os.path.join(output_path, "data")
-        ensure_directory_exists(cache_dir)
-        dataset = datasets.load_dataset("yifanmai/call-center", self.subset, split="test", cache_dir=cache_dir)
-        instances: List[Instance] = []
-        for row in dataset:
-            input = Input(text=row["transcript"])
-            instance = Instance(input=input, references=[], split=TEST_SPLIT)
-            instances.append(instance)
-        return instances
-
-
-class CallCenterSummarizationPairwiseComparisonScenario(Scenario):
-    """Call center summarization."""
-
-    name = "call_center_summarization_pairwise_comparison"
-    description = "Call center summarization."
-    tags = ["call_center"]
-
-    def get_instances(self, output_path: str) -> List[Instance]:
-        cache_dir = os.path.join(output_path, "data")
-        ensure_directory_exists(cache_dir)
-        dataset = datasets.load_dataset(
-            "yifanmai/call-center", "summarization_with_annotations", split="test", cache_dir=cache_dir
-        )
-        instances: List[Instance] = []
-        for row in dataset:
-            input = Input(text=row["transcript"])
-            reference = Reference(output=Output(text=row["gpt-4o-mini-2024-07-18_summary"]), tags=[CORRECT_TAG])
-            instance = Instance(input=input, references=[reference], split=TEST_SPLIT)
-            instances.append(instance)
-        return instances
-
-
-class CallCenterSummarizationKeyPointsRecallScenario(Scenario):
-    """Call center summarization."""
-
-    name = "call_center_summarization_key_points_recall"
-    description = "Call center summarization."
-    tags = ["call_center"]
-
-    def get_instances(self, output_path: str) -> List[Instance]:
-        cache_dir = os.path.join(output_path, "data")
-        ensure_directory_exists(cache_dir)
-        dataset = datasets.load_dataset(
-            "yifanmai/call-center", "summarization_with_annotations", split="test", cache_dir=cache_dir
-        )
-        instances: List[Instance] = []
-        for row in dataset:
-            input = Input(text=row["transcript"])
-            references = [
-                Reference(output=Output(text=key_point), tags=[CORRECT_TAG])
-                for key_point in row["gpt-4o-mini-2024-07-18_key_points"]
-            ]
-            instance = Instance(input=input, references=references, split=TEST_SPLIT)
-            instances.append(instance)
-        return instances
diff --git a/src/helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py b/src/helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py
deleted file mode 100644
index 57409b8921..0000000000
--- a/src/helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py
+++ /dev/null
@@ -1,50 +0,0 @@
-import csv
-import os
-from typing import List
-
-from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
-from helm.benchmark.scenarios.scenario import (
-    Scenario,
-    Instance,
-    TEST_SPLIT,
-    Input,
-    ScenarioMetadata,
-)
-
-
-_DATA_DIRRECTORY_PATH = "restricted/helpdesk_call_summarization/HELM Sample Transcripts_20241221_0045"
-
-
-class HelpdeskCallSummarizationScenario(Scenario):
-    """Helpdesk call summarization."""
-
-    name = "helpdesk_call_summarization"
-    description = "Helpdesk call summarization."
-    tags = ["helpdesk_call_center"]
-
-    def get_instances(self, output_path: str) -> List[Instance]:
-        instances: List[Instance] = []
-        for file_name in os.listdir(_DATA_DIRRECTORY_PATH):
-            if not file_name.endswith(".csv") or not file_name.startswith("Call1-"):
-                continue
-            file_path = os.path.join(_DATA_DIRRECTORY_PATH, file_name)
-            with open(file_path) as f:
-                csv_reader = csv.reader(f)
-                prompt_lines = [f"{row[0]}: {row[4]}" for row in csv_reader]
-            prompt = "\n".join(prompt_lines)
-            instance_id = file_name.removeprefix("Call1-").removesuffix(".csv")
-            input = Input(text=prompt)
-            instance = Instance(id=instance_id, input=input, references=[], split=TEST_SPLIT)
-            instances.append(instance)
-        return instances
-
-    def get_metadata(self) -> ScenarioMetadata:
-        return ScenarioMetadata(
-            name="helpdesk_call_summarization",
-            display_name="Helpdesk Call summarization",
-            short_display_name=None,
-            description="Helpdesk Call summarization",
-            taxonomy=TaxonomyInfo(task="summarization", what="n/a", when="?", who="n/a", language="English"),
-            main_metric="unknown",
-            main_split="test",
-        )
diff --git a/src/helm/benchmark/static/schema_call_center.yaml b/src/helm/benchmark/static/schema_call_center.yaml
deleted file mode 100644
index 7f222470b5..0000000000
--- a/src/helm/benchmark/static/schema_call_center.yaml
+++ /dev/null
@@ -1,269 +0,0 @@
----
-############################################################
-metrics:
-  # Infrastructure metrics:
-  - name: num_perplexity_tokens
-    display_name: '# tokens'
-    description: Average number of tokens in the predicted output (for language modeling, the input too).
-  - name: num_bytes
-    display_name: '# bytes'
-    description: Average number of bytes in the predicted output (for language modeling, the input too).
-
-  - name: num_references
-    display_name: '# ref'
-    description: Number of references.
-  - name: num_train_trials
-    display_name: '# trials'
-    description: Number of trials, where in each trial we choose an independent, random set of training instances.
-  - name: estimated_num_tokens_cost
-    display_name: 'cost'
-    description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request.
-  - name: num_prompt_tokens
-    display_name: '# prompt tokens'
-    description: Number of tokens in the prompt.
-  - name: num_prompt_characters
-    display_name: '# prompt chars'
-    description: Number of characters in the prompt.
-  - name: num_completion_tokens
-    display_name: '# completion tokens'
-    description: Actual number of completion tokens (over all completions).
-  - name: num_output_tokens
-    display_name: '# output tokens'
-    description: Actual number of output tokens.
-  - name: max_num_output_tokens
-    display_name: 'Max output tokens'
-    description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences).
-  - name: num_requests
-    display_name: '# requests'
-    description: Number of distinct API requests.
-  - name: num_instances
-    display_name: '# eval'
-    description: Number of evaluation instances.
-  - name: num_train_instances
-    display_name: '# train'
-    description: Number of training instances (e.g., in-context examples).
-  - name: prompt_truncated
-    display_name: truncated
-    description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).
-  - name: finish_reason_length
-    display_name: finish b/c length
-    description: Fraction of instances where the the output was terminated because of the max tokens limit.
-  - name: finish_reason_stop
-    display_name: finish b/c stop
-    description: Fraction of instances where the the output was terminated because of the stop sequences.
-  - name: finish_reason_endoftext
-    display_name: finish b/c endoftext
-    description: Fraction of instances where the the output was terminated because the end of text token was generated.
-  - name: finish_reason_unknown
-    display_name: finish b/c unknown
-    description: Fraction of instances where the the output was terminated for unknown reasons.
-  - name: num_completions
-    display_name: '# completions'
-    description: Number of completions.
-  - name: predicted_index
-    display_name: Predicted index
-    description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice).
-
-  # Accuracy metrics:
-  - name: exact_match
-    display_name: Exact match
-    short_display_name: EM
-    description: Fraction of instances that the predicted output matches a correct reference exactly.
-    lower_is_better: false
-
-  # Summarization metrics:
-  - name: annotation_call_center_summarization_faithfulness
-    display_name: Faithfulness
-    short_display_name: Faithfulness
-    description: Whether all the information expressed by the summary can be inferred from the source transcript.
-    lower_is_better: false
-  - name: annotation_call_center_summarization_relevance
-    display_name: Relevance
-    short_display_name: Relevance
-    description: Whether the summary includes only important information from the source.
-    lower_is_better: false
-  - name: annotation_call_center_summarization_coherence
-    display_name: Coherence
-    short_display_name: Coherence
-    description: Whether the summary organizes the relevant information into a well-structured summary.
-    lower_is_better: false
-
-  - name: annotation_call_center_summarization_pairwise_comparison_score
-    display_name: Pairwise
-    short_display_name: Pairwise
-    description: Whether the model's summary was preferred by the evaluator model
-    lower_is_better: false
-    
-  - name: annotation_call_center_summarization_key_points_recall_score
-    display_name: Recall
-    short_display_name: Recall
-    description: How many key items were recalled
-    lower_is_better: false
-
-  - name: annotation_helpdesk_call_center_summarization_score
-    display_name: Score
-    short_display_name: Score
-    description: Score
-    lower_is_better: false
-
-
-  - name: call_summarization_score
-    display_name: Score
-    short_display_name: Score
-    description: Score
-    lower_is_better: false
-
-
-############################################################
-perturbations: []
-
-############################################################
-metric_groups:
-  - name: summarization_metrics
-    display_name: Summarization
-    hide_win_rates: true
-    metrics:
-      - name: call_summarization_score
-        split: ${main_split}
-      # - name: annotation_helpdesk_call_center_summarization_score
-      #   split: ${main_split}
-      # - name: annotation_call_center_summarization_faithfulness
-      #   split: ${main_split}
-      # - name: annotation_call_center_summarization_relevance
-      #   split: ${main_split}
-      # - name: annotation_call_center_summarization_coherence
-      #   split: ${main_split}
-
-  - name: pairwise_comparison_metrics
-    display_name: Pairwise Comparison
-    hide_win_rates: true
-    metrics:
-      - name: annotation_call_center_summarization_pairwise_comparison_score
-        split: ${main_split}
-
-  - name: key_points_recall_metrics
-    display: Key Points Recall
-    hide_win_rates: true
-    metrics:
-      - name: annotation_call_center_summarization_key_points_recall_score
-        split: ${main_split}
-
-  - name: efficiency
-    display_name: Efficiency
-    metrics:
-    - name: inference_runtime
-      split: ${main_split}
-
-  - name: general_information
-    display_name: General information
-    hide_win_rates: true
-    metrics:
-    - name: num_instances
-      split: ${main_split}
-    - name: num_train_instances
-      split: ${main_split}
-    - name: prompt_truncated
-      split: ${main_split}
-    - name: num_prompt_tokens
-      split: ${main_split}
-    - name: num_output_tokens
-      split: ${main_split}
-
-############################################################
-
-run_groups:
-  - name: call_center_scenarios
-    display_name: Call Center Scenarios
-    description: Scenarios representating realistic tasks from the call center.
-    category: All scenarios
-    subgroups:
-      - helpdesk_call_summarization
-      # - call_center_summarization
-      # - call_center_summarization_real_call_transcripts
-      # - call_center_summarization_pairwise_comparison
-      # - call_center_summarization_key_points_recall
-
-  # - name: call_center_summarization
-  #   display_name: Summarization
-  #   description: summarization
-  #   metric_groups:
-  #     # - accuracy
-  #     - summarization_metrics
-  #     - efficiency
-  #     - general_information
-  #   environment:
-  #     main_split: test
-  #   taxonomy:
-  #     task: summarization
-  #     what: n/a
-  #     who: n/a
-  #     when: "?"
-  #     language: English
-
-  - name: helpdesk_call_summarization
-    display_name: Helpdesk Call summarization
-    description: Helpdesk Call summarization
-    metric_groups:
-      # - accuracy
-      - summarization_metrics
-      - efficiency
-      - general_information
-    environment:
-      main_split: test
-    taxonomy:
-      task: summarization
-      what: n/a
-      who: n/a
-      when: "?"
-      language: English
-
-  # - name: call_center_summarization_real_call_transcripts
-  #   display_name: Summarization (Real)
-  #   description: Summarization with real call transcripts
-  #   metric_groups:
-  #     # - accuracy
-  #     - summarization_metrics
-  #     - efficiency
-  #     - general_information
-  #   environment:
-  #     main_split: test
-  #   taxonomy:
-  #     task: summarization
-  #     what: n/a
-  #     who: n/a
-  #     when: "?"
-  #     language: English
-
-  # - name: call_center_summarization_pairwise_comparison
-  #   display_name: Summarization (Pairwise)
-  #   description: summarization
-  #   metric_groups:
-  #     # - accuracy
-  #     - pairwise_comparison_metrics
-  #     - efficiency
-  #     - general_information
-  #   environment:
-  #     main_split: test
-  #   taxonomy:
-  #     task: summarization
-  #     what: n/a
-  #     who: n/a
-  #     when: "?"
-  #     language: English
-
-  # - name: call_center_summarization_key_points_recall
-  #   display_name: Summarization (Key Points Recall)
-  #   description: summarization
-  #   metric_groups:
-  #     # - accuracy
-  #     - key_points_recall_metrics
-  #     - efficiency
-  #     - general_information
-  #   environment:
-  #     main_split: test
-  #   taxonomy:
-  #     task: summarization
-  #     what: n/a
-  #     who: n/a
-  #     when: "?"
-  #     language: English