diff --git a/helm-frontend/src/components/Landing/CallCenterLanding.tsx b/helm-frontend/src/components/Landing/CallCenterLanding.tsx deleted file mode 100644 index f00d5c8116..0000000000 --- a/helm-frontend/src/components/Landing/CallCenterLanding.tsx +++ /dev/null @@ -1,58 +0,0 @@ -import MiniLeaderboard from "@/components/MiniLeaderboard"; -import { Link } from "react-router-dom"; - -export default function CallCenterLanding() { - return ( -
-

- HELM Call Center Leaderboard -

-
-
-

- LLMs show great potential for applications for the call center, yet - there is a lack of domain-specific and ecologically-valid - evaluations in this domain. To address this, we introduce the{" "} - HELM Call Center leaderboard. - The HELM Call Center leaderboard evaluates leading LLMs on a - summarization task over a dataset of real helpdesk call transcripts - provided by Accenture. The quality of the summaries is evaluated - using LLM-as-judge with an ensemble of 3 models. We hope that this - leaderboard provides some initial insights into the potential of - LLMs in this domain. -

-

- This leaderboard was produced through research collaboration with{" "} - - Accenture - - , and was funded by the{" "} - - HAI Corporate Affiliate Program - - . -

-
-
- -
- - - -
-
-
-
- ); -} diff --git a/helm-frontend/src/components/Landing/CallTranscriptSummarizationLanding.tsx b/helm-frontend/src/components/Landing/CallTranscriptSummarizationLanding.tsx deleted file mode 100644 index f80eff31ff..0000000000 --- a/helm-frontend/src/components/Landing/CallTranscriptSummarizationLanding.tsx +++ /dev/null @@ -1,82 +0,0 @@ -import MiniLeaderboard from "@/components/MiniLeaderboard"; -import { Link } from "react-router-dom"; - -export default function CallTranscriptSummarizationLanding() { - return ( -
-

- HELM Call Transcript Summarization Leaderboard -

-
-
- -

- Large language models (LLMs) show great potential for call center - applications, yet there is a lack of domain-specific and - ecologically valid evaluations in this domain. To address this, we - introduce the{" "} - - HELM Call Transcript Summarization - {" "} - leaderboard, which evaluates leading LLMs on a summarization task - over a dataset of real call transcripts provided by Accenture. -

-

- This dataset consists of 162 transcribed calls to an internal - corporate IT helpdesk. The calls were transcribed using an automatic - speech recognition (ASR) model. Transcription errors were - deliberately left uncorrected to reflect the nature of real-life - transcripts. The transcripts were anonymized using a semi-automated - process with human verification. -

-

- To evaluate the LLMs, summaries of the transcripts were generated - using 17 LLMs. The quality of the generated summaries were then - evaluated using LLM-as-judge with an ensemble of 3 models. -

-

- As with all HELM leaderboards, this leaderboard provides full - transparency into all LLM requests and responses, and the results - are reproducible using the HELM open source framework. We hope that - this leaderboard offers initial insights into the potential of LLMs - for this task. -

-

- This leaderboard was produced through research collaboration with{" "} - - Accenture - - , and was funded by the{" "} - - HAI Corporate Affiliate Program - - . -

-
-
- -
- - - -
-
-
-
- ); -} diff --git a/helm-frontend/src/routes/Home.tsx b/helm-frontend/src/routes/Home.tsx index c33a391f64..144ac5258d 100644 --- a/helm-frontend/src/routes/Home.tsx +++ b/helm-frontend/src/routes/Home.tsx @@ -6,8 +6,6 @@ import ThaiExamLanding from "@/components/Landing/ThaiExamLanding"; import FinanceLanding from "@/components/Landing/FinanceLanding"; import HEIMLanding from "@/components/Landing/HEIMLanding"; import VHELMLanding from "@/components/VHELMLanding"; -import CallCenterLanding from "@/components/Landing/CallCenterLanding"; -import CallTranscriptSummarizationLanding from "@/components/Landing/CallTranscriptSummarizationLanding"; import CLEVALanding from "@/components/Landing/CLEVALanding"; import ToRRLanding from "@/components/Landing/ToRRLanding"; import HomeLanding from "@/components/Landing/HomeLanding"; @@ -48,10 +46,6 @@ export default function Home() { return ; } else if (window.PROJECT_ID === "finance") { return ; - } else if (window.PROJECT_ID === "call-center") { - return ; - } else if (window.PROJECT_ID === "call-transcript-summarization") { - return ; } else if (window.PROJECT_ID === "cleva") { return ; } else if (window.PROJECT_ID === "torr") { diff --git a/src/helm/benchmark/annotation/call_center_annotator.py b/src/helm/benchmark/annotation/call_center_annotator.py deleted file mode 100644 index 7176de4ffd..0000000000 --- a/src/helm/benchmark/annotation/call_center_annotator.py +++ /dev/null @@ -1,258 +0,0 @@ -import json -from json.decoder import JSONDecodeError -import textwrap -from typing import Any - -from helm.benchmark.adaptation.request_state import RequestState -from helm.benchmark.annotation.annotator import Annotator -from helm.clients.auto_client import AutoClient -from helm.common.hierarchical_logger import hlog -from helm.common.request import Request - - -class CallCenterSummarizationAnnotator(Annotator): - """Annotator for call center summarization.""" - - name = "call_center_summarization" - - PROMPT_TEMPLATE = """\ - 
Score the summary by scoring it on a five-point scale between 1 and 5 using three criteria: Faithfulness, Relevance, and Coherence. - - ### Criteria - Faithfulness: Can all the information expressed by the summary can be inferred from the source? (1 = not at all, 5 = very much) - Relevance: To what extent the summary include only important information from the source? (1 = not at all, 5 = very much) - Coherence: Does the summary organize the relevant information into a well-structured summary? (1 = not at all, 5 = very much) - - ### Call Transcript - {{CALL_TRANSCRIPT}} - - ### Summary - {{SUMMARY}} - - ### Task - Respond with only a raw JSON object in the following format, without using Markdown formatting: - - {"faithfulness": , "relevance": , "coherence": } - """ # noqa: E501 - - CRITERIA = [ - "faithfulness", - "relevance", - "coherence", - ] - - def __init__(self, auto_client: AutoClient): - super().__init__() - self._auto_client = auto_client - - def annotate(self, request_state: RequestState) -> Any: - assert request_state.result - assert len(request_state.result.completions) == 1 - call_transcript = request_state.instance.input.text - summary = request_state.result.completions[0].text.strip() - if not summary.strip(): - hlog("Returning 0 scores due to empty response") - return {"faithfulness": 0, "relevance": 0, "coherence": 0} - annotator_prompt = ( - textwrap.dedent(CallCenterSummarizationAnnotator.PROMPT_TEMPLATE) - .replace("{{CALL_TRANSCRIPT}}", call_transcript) - .replace("{{SUMMARY}}", summary) - ) - annotator_request = Request( - model="openai/gpt-4o-mini-2024-07-18", - model_deployment="openai/gpt-4o-mini-2024-07-18", - prompt=annotator_prompt, - temperature=0.0, - max_tokens=256, - ) - annotator_response = self._auto_client.make_request(annotator_request) - if not annotator_response.success: - raise Exception(f"Annotation request failed: {annotator_response.error}") - assert len(annotator_response.completions) == 1 - annotator_response_text = 
annotator_response.completions[0].text - # OpenAI models like to surround JSON objects with ```json and ``` Markdown formatting. - # This strips everything outside the outermost {} brackets. - json_start_index = annotator_response_text.find("{") - json_end_index = annotator_response_text.rfind("}") - if json_start_index < 0 or json_end_index < 0: - raise Exception(f"Malformed annotator response: {annotator_response_text}") - annotator_response_json = annotator_response_text[json_start_index : json_end_index + 1] - try: - annotator_response_parsed = json.loads(annotator_response_json) - except JSONDecodeError: - raise Exception(f"Malformed annotator response: {annotator_response_text}") - for expected_key in CallCenterSummarizationAnnotator.CRITERIA: - if expected_key not in annotator_response_parsed: - raise Exception(f"Malformed annotator response: {annotator_response_text}") - return annotator_response_parsed - - -class CallCenterSummarizationPairwiseComparisonAnnotator(Annotator): - """Annotator for call center summarization with pairwise comparison.""" - - name = "call_center_summarization_pairwise_comparison" - - PROMPT_TEMPLATE = """\ - Given a call transcript and two different summaries of the call transcript, select your preferred summary, which can be subjective, considering the criteria below. Also provide a one-sentence reasoning for your selection. - - ### Criteria - Faithfulness: Can all the information expressed by the summary can be inferred from the source? - Relevance: To what extent the summary include only important information from the source? - Coherence: Does the summary organize the relevant information into a well-structured summary? 
- - ### Call Transcript - {{CALL_TRANSCRIPT}} - - ### Summary A - {{SUMMARY_A}} - - ### Summary B - {{SUMMARY_B}} - - ### Task - Output only a JSON object with the following format: - - {"reasoning": "Reasoning", "selected": "A" or "B"} - """ # noqa: E501 - - def __init__(self, auto_client: AutoClient): - super().__init__() - self._auto_client = auto_client - - def annotate(self, request_state: RequestState) -> Any: - assert request_state.result - assert len(request_state.result.completions) == 1 - call_transcript = request_state.instance.input.text - summary = request_state.result.completions[0].text.strip() - assert len(request_state.instance.all_correct_references) == 1 - reference_summary = request_state.instance.all_correct_references[0].output.text - if not summary.strip(): - hlog("Returning 0 scores due to empty response") - return {"faithfulness": 0, "relevance": 0, "coherence": 0} - assert request_state.instance.id is not None - instance_id = int(request_state.instance.id[2:]) - if instance_id % 2: - reference_option = "A" - summary_a = reference_summary - summary_b = summary - else: - reference_option = "B" - summary_a = summary - summary_b = reference_summary - annotator_prompt = ( - textwrap.dedent(CallCenterSummarizationPairwiseComparisonAnnotator.PROMPT_TEMPLATE) - .replace("{{CALL_TRANSCRIPT}}", call_transcript) - .replace("{{SUMMARY_B}}", summary_a) - .replace("{{SUMMARY_A}}", summary_b) - ) - annotator_request = Request( - model="openai/gpt-4o-2024-08-06", - model_deployment="openai/gpt-4o-2024-08-06", - prompt=annotator_prompt, - temperature=0.0, - max_tokens=256, - ) - annotator_response = self._auto_client.make_request(annotator_request) - if not annotator_response.success: - raise Exception(f"Annotation request failed: {annotator_response.error}") - assert len(annotator_response.completions) == 1 - annotator_response_text = annotator_response.completions[0].text - # OpenAI models like to surround JSON objects with ```json and ``` Markdown 
formatting. - # This strips everything outside the outermost {} brackets. - json_start_index = annotator_response_text.find("{") - json_end_index = annotator_response_text.rfind("}") - if json_start_index < 0 or json_end_index < 0: - raise Exception(f"Malformed annotator response: {annotator_response_text}") - annotator_response_json = annotator_response_text[json_start_index : json_end_index + 1] - try: - annotator_response_parsed = json.loads(annotator_response_json) - except JSONDecodeError: - raise Exception(f"Malformed annotator response: {annotator_response_text}") - for expected_key in ["reasoning", "selected"]: - if expected_key not in annotator_response_parsed: - raise Exception(f"Malformed annotator response: {annotator_response_text}") - score = 0.0 - selected = annotator_response_parsed["selected"].strip() - if selected != "A" and selected != "B": - raise Exception(f"Malformed annotator response: {annotator_response_text}") - if selected == reference_option: - score = 0.0 - else: - score = 1.0 - return { - "reasoning": annotator_response_parsed["reasoning"], - "selected": selected, - "reference_option": reference_option, - "score": score, - } - - -class CallCenterSummarizationKeyPointsRecallAnnotator(Annotator): - """Annotator for call center summarization with key point recall.""" - - name = "call_center_summarization_key_points_recall" - - PROMPT_TEMPLATE = """\ - Given a call transcript, a list of key points and a summary, determine which key points are included in the summary. - - ### Call Transcript - {{CALL_TRANSCRIPT}} - - ### Key Points - {{KEY_POINTS}} - - ### Summary - {{SUMMARY}} - - ### Task - Output only a JSON array of booleans, where each boolean indicates if the corresponding key point was included in the summary. 
- """ # noqa: E501 - - def __init__(self, auto_client: AutoClient): - super().__init__() - self._auto_client = auto_client - - def annotate(self, request_state: RequestState) -> Any: - assert request_state.result - assert len(request_state.result.completions) == 1 - call_transcript = request_state.instance.input.text - summary = request_state.result.completions[0].text.strip() - key_points = "\n".join( - [f"- {reference.output.text}" for reference in request_state.instance.all_correct_references] - ) - if not summary.strip(): - hlog("Returning 0 scores due to empty response") - return {"faithfulness": 0, "relevance": 0, "coherence": 0} - annotator_prompt = ( - textwrap.dedent(CallCenterSummarizationKeyPointsRecallAnnotator.PROMPT_TEMPLATE) - .replace("{{CALL_TRANSCRIPT}}", call_transcript) - .replace("{{KEY_POINTS}}", key_points) - .replace("{{SUMMARY}}", summary) - ) - annotator_request = Request( - model="openai/gpt-4o-2024-08-06", - model_deployment="openai/gpt-4o-2024-08-06", - prompt=annotator_prompt, - temperature=0.0, - max_tokens=256, - ) - annotator_response = self._auto_client.make_request(annotator_request) - if not annotator_response.success: - raise Exception(f"Annotation request failed: {annotator_response.error}") - assert len(annotator_response.completions) == 1 - annotator_response_text = annotator_response.completions[0].text - # OpenAI models like to surround JSON objects with ```json and ``` Markdown formatting. - # This strips everything outside the outermost [] brackets. 
- json_start_index = annotator_response_text.find("[") - json_end_index = annotator_response_text.rfind("]") - if json_start_index < 0 or json_end_index < 0: - raise Exception(f"Malformed annotator response: {annotator_response_text}") - annotator_response_json = annotator_response_text[json_start_index : json_end_index + 1] - try: - annotator_response_parsed = json.loads(annotator_response_json) - except JSONDecodeError: - raise Exception(f"Malformed annotator response: {annotator_response_text}") - if not len(annotator_response_parsed): - raise Exception(f"Malformed annotator response: {annotator_response_text}") - score = sum([1.0 if elem else 0.0 for elem in annotator_response_parsed]) / len(annotator_response_parsed) - return {"key_points_found": json.dumps(annotator_response_parsed), "score": score} diff --git a/src/helm/benchmark/metrics/helpdesk_call_summarization_metrics.py b/src/helm/benchmark/metrics/helpdesk_call_summarization_metrics.py deleted file mode 100644 index 21020b15b4..0000000000 --- a/src/helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +++ /dev/null @@ -1,48 +0,0 @@ -from typing import Any, Dict, List - -from helm.benchmark.adaptation.adapter_spec import AdapterSpec -from helm.benchmark.adaptation.request_state import RequestState -from helm.benchmark.metrics.metric import Metric, MetricMetadata -from helm.benchmark.metrics.metric_name import MetricName -from helm.benchmark.metrics.metric_service import MetricService -from helm.benchmark.metrics.statistic import Stat - - -class HelpdeskCallSummarizationMetric(Metric): - """Score metrics for helpdesk call summarization.""" - - def evaluate_generation( - self, - adapter_spec: AdapterSpec, - request_state: RequestState, - metric_service: MetricService, - eval_cache_path: str, - ) -> List[Stat]: - assert request_state.annotations - annotations: Dict[str, Any] = request_state.annotations["helpdesk_call_center_summarization"] - scores: List[int] = [] - for annotation_key, 
annotation_value in annotations.items(): - if annotation_key.endswith("_score") and annotation_value is not None: - scores.append(annotation_value) - if not scores: - raise ValueError( - "Could not compute score in HelpdeskCallSummarizationMetric because all annotators failed." - ) - score = sum(scores) / len(scores) - # normalize score - score = (score - 1) / 9 - return [ - Stat(MetricName("call_summarization_score")).add(score), - ] - - def get_metadata(self) -> List[MetricMetadata]: - return [ - MetricMetadata( - name="call_summarization_score", - display_name="Score", - short_display_name="Score", - description="Score", - lower_is_better=False, - group="summarization_metrics", - ), - ] diff --git a/src/helm/benchmark/run_specs/call_center_run_specs.py b/src/helm/benchmark/run_specs/call_center_run_specs.py deleted file mode 100644 index d1eaded8b3..0000000000 --- a/src/helm/benchmark/run_specs/call_center_run_specs.py +++ /dev/null @@ -1,201 +0,0 @@ -"""Run specs for experiments only. - -These run specs are not intended for use with public leaderboards.""" - -from helm.benchmark.adaptation.adapter_spec import ADAPT_GENERATION, AdapterSpec -from helm.benchmark.annotation.annotator import AnnotatorSpec -from helm.benchmark.metrics.common_metric_specs import get_basic_metric_specs -from helm.benchmark.metrics.metric import MetricSpec -from helm.benchmark.run_spec import RunSpec, run_spec_function -from helm.benchmark.scenarios.scenario import ScenarioSpec - - -@run_spec_function("call_center_summarization") -def get_call_center_summarization_spec(subset: str = "summarization") -> RunSpec: - from helm.benchmark.annotation.call_center_annotator import CallCenterSummarizationAnnotator - - scenario_spec = ScenarioSpec( - class_name="helm.benchmark.scenarios.call_center_scenario.CallCenterSummarizationScenario", - args={"subset": subset}, - ) - - instructions = "Summarize the call transcript in under 10 sentences." 
- - adapter_spec = AdapterSpec( - method=ADAPT_GENERATION, - instructions=instructions, - input_prefix="### Call Transcript\n", - input_suffix="", - output_prefix="", - output_suffix="", - max_train_instances=0, - temperature=0.0, - max_tokens=512, - num_outputs=1, - ) - - annotator_specs = annotator_specs = [ - AnnotatorSpec(class_name="helm.benchmark.annotation.call_center_annotator.CallCenterSummarizationAnnotator") - ] - annotation_metric_specs = [ - MetricSpec( - class_name="helm.benchmark.metrics.annotation_metrics.AnnotationLikertScaleMetric", - args={ - "annotator_name": CallCenterSummarizationAnnotator.name, - "key": criterion, - "min_score": 1, - "max_score": 5, - }, - ) - for criterion in CallCenterSummarizationAnnotator.CRITERIA - ] - - metric_specs = get_basic_metric_specs([]) + annotation_metric_specs - - group = "call_center_summarization" if subset == "summarization" else f"call_center_summarization_{subset}" - - return RunSpec( - name="call_center_summarization", - scenario_spec=scenario_spec, - adapter_spec=adapter_spec, - metric_specs=metric_specs, - annotators=annotator_specs, - groups=[group], - ) - - -@run_spec_function("call_center_summarization_pairwise_comparison") -def get_call_center_summarization_pairwise_comparison_spec() -> RunSpec: - scenario_spec = ScenarioSpec( - class_name="helm.benchmark.scenarios.call_center_scenario.CallCenterSummarizationPairwiseComparisonScenario", - ) - - instructions = "Summarize the call transcript in under 10 sentences." 
- - adapter_spec = AdapterSpec( - method=ADAPT_GENERATION, - instructions=instructions, - input_prefix="### Call Transcript\n", - input_suffix="", - output_prefix="", - output_suffix="", - max_train_instances=0, - temperature=0.0, - max_tokens=512, - num_outputs=1, - ) - - annotator_specs = annotator_specs = [ - AnnotatorSpec( - class_name="helm.benchmark.annotation.call_center_annotator.CallCenterSummarizationPairwiseComparisonAnnotator" # noqa: E501 - ) - ] - - metric_specs = get_basic_metric_specs([]) + [ - MetricSpec( - class_name="helm.benchmark.metrics.annotation_metrics.AnnotationNumericMetric", - args={"annotator_name": "call_center_summarization_pairwise_comparison", "key": "score"}, - ) - ] - - return RunSpec( - name="call_center_summarization_pairwise_comparison", - scenario_spec=scenario_spec, - adapter_spec=adapter_spec, - metric_specs=metric_specs, - annotators=annotator_specs, - groups=["call_center_summarization_pairwise_comparison"], - ) - - -@run_spec_function("call_center_summarization_key_points_recall") -def get_call_center_summarization_key_points_recall_spec() -> RunSpec: - scenario_spec = ScenarioSpec( - class_name="helm.benchmark.scenarios.call_center_scenario.CallCenterSummarizationKeyPointsRecallScenario", - ) - - instructions = "Summarize the call transcript in under 10 sentences." 
- - adapter_spec = AdapterSpec( - method=ADAPT_GENERATION, - instructions=instructions, - input_prefix="### Call Transcript\n", - input_suffix="", - output_prefix="", - output_suffix="", - max_train_instances=0, - temperature=0.0, - max_tokens=512, - num_outputs=1, - ) - - annotator_specs = annotator_specs = [ - AnnotatorSpec( - class_name="helm.benchmark.annotation.call_center_annotator.CallCenterSummarizationKeyPointsRecallAnnotator" - ) - ] - - metric_specs = get_basic_metric_specs([]) + [ - MetricSpec( - class_name="helm.benchmark.metrics.annotation_metrics.AnnotationNumericMetric", - args={"annotator_name": "call_center_summarization_key_points_recall", "key": "score"}, - ) - ] - - return RunSpec( - name="call_center_summarization_key_points_recall", - scenario_spec=scenario_spec, - adapter_spec=adapter_spec, - metric_specs=metric_specs, - annotators=annotator_specs, - groups=["call_center_summarization_key_points_recall"], - ) - - -@run_spec_function("helpdesk_call_summarization") -def get_helpdesk_call_summarization_spec() -> RunSpec: - scenario_spec = ScenarioSpec( - class_name="helm.benchmark.scenarios.helpdesk_call_summarization_scenario.HelpdeskCallSummarizationScenario", - ) - annotator_specs = annotator_specs = [ - AnnotatorSpec( - class_name="helm.benchmark.annotation.helpdesk_call_summarization_annotator.HelpdeskCallSummarizationAnnotator" # noqa: E501 - ) - ] - - instructions = "The following is a call transcript of a call between a compnay's employee and the company's IT helpdesk. Summarize the call transcript in under 200 words." 
# noqa: E501 - - adapter_spec = AdapterSpec( - method=ADAPT_GENERATION, - instructions=instructions, - input_prefix="### Call Transcript\n", - input_suffix="", - output_prefix="", - output_suffix="", - max_train_instances=0, - temperature=0.0, - max_tokens=512, - num_outputs=1, - ) - - # annotator_specs = annotator_specs = [ - # AnnotatorSpec(class_name="helm.benchmark.annotation.call_center_annotator.CallCenterSummarizationAnnotator") - # ] - annotation_metric_specs = [ - MetricSpec( - class_name="helm.benchmark.metrics.helpdesk_call_summarization_metrics.HelpdeskCallSummarizationMetric", - ), - ] - - metric_specs = get_basic_metric_specs([]) + annotation_metric_specs - - group = "helpdesk_call_summarization" - - return RunSpec( - name="helpdesk_call_summarization", - scenario_spec=scenario_spec, - adapter_spec=adapter_spec, - metric_specs=metric_specs, - annotators=annotator_specs, - groups=[group], - ) diff --git a/src/helm/benchmark/scenarios/call_center_scenario.py b/src/helm/benchmark/scenarios/call_center_scenario.py deleted file mode 100644 index 724bf8049f..0000000000 --- a/src/helm/benchmark/scenarios/call_center_scenario.py +++ /dev/null @@ -1,84 +0,0 @@ -import datasets -import os -from typing import List - -from helm.benchmark.scenarios.scenario import ( - CORRECT_TAG, - Output, - Reference, - Scenario, - Instance, - TEST_SPLIT, - Input, -) -from helm.common.general import ensure_directory_exists - - -class CallCenterSummarizationScenario(Scenario): - """Call center summarization.""" - - name = "call_center_summarization" - description = "Call center summarization." 
- tags = ["call_center"] - - def __init__(self, subset: str): - super().__init__() - self.subset = subset - - def get_instances(self, output_path: str) -> List[Instance]: - cache_dir = os.path.join(output_path, "data") - ensure_directory_exists(cache_dir) - dataset = datasets.load_dataset("yifanmai/call-center", self.subset, split="test", cache_dir=cache_dir) - instances: List[Instance] = [] - for row in dataset: - input = Input(text=row["transcript"]) - instance = Instance(input=input, references=[], split=TEST_SPLIT) - instances.append(instance) - return instances - - -class CallCenterSummarizationPairwiseComparisonScenario(Scenario): - """Call center summarization.""" - - name = "call_center_summarization_pairwise_comparison" - description = "Call center summarization." - tags = ["call_center"] - - def get_instances(self, output_path: str) -> List[Instance]: - cache_dir = os.path.join(output_path, "data") - ensure_directory_exists(cache_dir) - dataset = datasets.load_dataset( - "yifanmai/call-center", "summarization_with_annotations", split="test", cache_dir=cache_dir - ) - instances: List[Instance] = [] - for row in dataset: - input = Input(text=row["transcript"]) - reference = Reference(output=Output(text=row["gpt-4o-mini-2024-07-18_summary"]), tags=[CORRECT_TAG]) - instance = Instance(input=input, references=[reference], split=TEST_SPLIT) - instances.append(instance) - return instances - - -class CallCenterSummarizationKeyPointsRecallScenario(Scenario): - """Call center summarization.""" - - name = "call_center_summarization_key_points_recall" - description = "Call center summarization." 
- tags = ["call_center"] - - def get_instances(self, output_path: str) -> List[Instance]: - cache_dir = os.path.join(output_path, "data") - ensure_directory_exists(cache_dir) - dataset = datasets.load_dataset( - "yifanmai/call-center", "summarization_with_annotations", split="test", cache_dir=cache_dir - ) - instances: List[Instance] = [] - for row in dataset: - input = Input(text=row["transcript"]) - references = [ - Reference(output=Output(text=key_point), tags=[CORRECT_TAG]) - for key_point in row["gpt-4o-mini-2024-07-18_key_points"] - ] - instance = Instance(input=input, references=references, split=TEST_SPLIT) - instances.append(instance) - return instances diff --git a/src/helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py b/src/helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py deleted file mode 100644 index 57409b8921..0000000000 --- a/src/helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +++ /dev/null @@ -1,50 +0,0 @@ -import csv -import os -from typing import List - -from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo -from helm.benchmark.scenarios.scenario import ( - Scenario, - Instance, - TEST_SPLIT, - Input, - ScenarioMetadata, -) - - -_DATA_DIRRECTORY_PATH = "restricted/helpdesk_call_summarization/HELM Sample Transcripts_20241221_0045" - - -class HelpdeskCallSummarizationScenario(Scenario): - """Helpdesk call summarization.""" - - name = "helpdesk_call_summarization" - description = "Helpdesk call summarization." 
- tags = ["helpdesk_call_center"] - - def get_instances(self, output_path: str) -> List[Instance]: - instances: List[Instance] = [] - for file_name in os.listdir(_DATA_DIRRECTORY_PATH): - if not file_name.endswith(".csv") or not file_name.startswith("Call1-"): - continue - file_path = os.path.join(_DATA_DIRRECTORY_PATH, file_name) - with open(file_path) as f: - csv_reader = csv.reader(f) - prompt_lines = [f"{row[0]}: {row[4]}" for row in csv_reader] - prompt = "\n".join(prompt_lines) - instance_id = file_name.removeprefix("Call1-").removesuffix(".csv") - input = Input(text=prompt) - instance = Instance(id=instance_id, input=input, references=[], split=TEST_SPLIT) - instances.append(instance) - return instances - - def get_metadata(self) -> ScenarioMetadata: - return ScenarioMetadata( - name="helpdesk_call_summarization", - display_name="Helpdesk Call summarization", - short_display_name=None, - description="Helpdesk Call summarization", - taxonomy=TaxonomyInfo(task="summarization", what="n/a", when="?", who="n/a", language="English"), - main_metric="unknown", - main_split="test", - ) diff --git a/src/helm/benchmark/static/schema_call_center.yaml b/src/helm/benchmark/static/schema_call_center.yaml deleted file mode 100644 index 7f222470b5..0000000000 --- a/src/helm/benchmark/static/schema_call_center.yaml +++ /dev/null @@ -1,269 +0,0 @@ ---- -############################################################ -metrics: - # Infrastructure metrics: - - name: num_perplexity_tokens - display_name: '# tokens' - description: Average number of tokens in the predicted output (for language modeling, the input too). - - name: num_bytes - display_name: '# bytes' - description: Average number of bytes in the predicted output (for language modeling, the input too). - - - name: num_references - display_name: '# ref' - description: Number of references. 
- - name: num_train_trials - display_name: '# trials' - description: Number of trials, where in each trial we choose an independent, random set of training instances. - - name: estimated_num_tokens_cost - display_name: 'cost' - description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request. - - name: num_prompt_tokens - display_name: '# prompt tokens' - description: Number of tokens in the prompt. - - name: num_prompt_characters - display_name: '# prompt chars' - description: Number of characters in the prompt. - - name: num_completion_tokens - display_name: '# completion tokens' - description: Actual number of completion tokens (over all completions). - - name: num_output_tokens - display_name: '# output tokens' - description: Actual number of output tokens. - - name: max_num_output_tokens - display_name: 'Max output tokens' - description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences). - - name: num_requests - display_name: '# requests' - description: Number of distinct API requests. - - name: num_instances - display_name: '# eval' - description: Number of evaluation instances. - - name: num_train_instances - display_name: '# train' - description: Number of training instances (e.g., in-context examples). - - name: prompt_truncated - display_name: truncated - description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples). - - name: finish_reason_length - display_name: finish b/c length - description: Fraction of instances where the the output was terminated because of the max tokens limit. - - name: finish_reason_stop - display_name: finish b/c stop - description: Fraction of instances where the the output was terminated because of the stop sequences. 
- - name: finish_reason_endoftext - display_name: finish b/c endoftext - description: Fraction of instances where the the output was terminated because the end of text token was generated. - - name: finish_reason_unknown - display_name: finish b/c unknown - description: Fraction of instances where the the output was terminated for unknown reasons. - - name: num_completions - display_name: '# completions' - description: Number of completions. - - name: predicted_index - display_name: Predicted index - description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice). - - # Accuracy metrics: - - name: exact_match - display_name: Exact match - short_display_name: EM - description: Fraction of instances that the predicted output matches a correct reference exactly. - lower_is_better: false - - # Summarization metrics: - - name: annotation_call_center_summarization_faithfulness - display_name: Faithfulness - short_display_name: Faithfulness - description: Whether all the information expressed by the summary can be inferred from the source transcript. - lower_is_better: false - - name: annotation_call_center_summarization_relevance - display_name: Relevance - short_display_name: Relevance - description: Whether the summary includes only important information from the source. - lower_is_better: false - - name: annotation_call_center_summarization_coherence - display_name: Coherence - short_display_name: Coherence - description: Whether the summary organizes the relevant information into a well-structured summary. 
- lower_is_better: false - - - name: annotation_call_center_summarization_pairwise_comparison_score - display_name: Pairwise - short_display_name: Pairwise - description: Whether the model's summary was preferred by the evaluator model - lower_is_better: false - - - name: annotation_call_center_summarization_key_points_recall_score - display_name: Recall - short_display_name: Recall - description: How many key items were recalled - lower_is_better: false - - - name: annotation_helpdesk_call_center_summarization_score - display_name: Score - short_display_name: Score - description: Score - lower_is_better: false - - - - name: call_summarization_score - display_name: Score - short_display_name: Score - description: Score - lower_is_better: false - - -############################################################ -perturbations: [] - -############################################################ -metric_groups: - - name: summarization_metrics - display_name: Summarization - hide_win_rates: true - metrics: - - name: call_summarization_score - split: ${main_split} - # - name: annotation_helpdesk_call_center_summarization_score - # split: ${main_split} - # - name: annotation_call_center_summarization_faithfulness - # split: ${main_split} - # - name: annotation_call_center_summarization_relevance - # split: ${main_split} - # - name: annotation_call_center_summarization_coherence - # split: ${main_split} - - - name: pairwise_comparison_metrics - display_name: Pairwise Comparison - hide_win_rates: true - metrics: - - name: annotation_call_center_summarization_pairwise_comparison_score - split: ${main_split} - - - name: key_points_recall_metrics - display: Key Points Recall - hide_win_rates: true - metrics: - - name: annotation_call_center_summarization_key_points_recall_score - split: ${main_split} - - - name: efficiency - display_name: Efficiency - metrics: - - name: inference_runtime - split: ${main_split} - - - name: general_information - display_name: General information - 
hide_win_rates: true - metrics: - - name: num_instances - split: ${main_split} - - name: num_train_instances - split: ${main_split} - - name: prompt_truncated - split: ${main_split} - - name: num_prompt_tokens - split: ${main_split} - - name: num_output_tokens - split: ${main_split} - -############################################################ - -run_groups: - - name: call_center_scenarios - display_name: Call Center Scenarios - description: Scenarios representing realistic tasks from the call center. - category: All scenarios - subgroups: - - helpdesk_call_summarization - # - call_center_summarization - # - call_center_summarization_real_call_transcripts - # - call_center_summarization_pairwise_comparison - # - call_center_summarization_key_points_recall - - # - name: call_center_summarization - # display_name: Summarization - # description: summarization - # metric_groups: - # # - accuracy - # - summarization_metrics - # - efficiency - # - general_information - # environment: - # main_split: test - # taxonomy: - # task: summarization - # what: n/a - # who: n/a - # when: "?" - # language: English - - - name: helpdesk_call_summarization - display_name: Helpdesk Call summarization - description: Helpdesk Call summarization - metric_groups: - # - accuracy - - summarization_metrics - - efficiency - - general_information - environment: - main_split: test - taxonomy: - task: summarization - what: n/a - who: n/a - when: "?" - language: English - - # - name: call_center_summarization_real_call_transcripts - # display_name: Summarization (Real) - # description: Summarization with real call transcripts - # metric_groups: - # # - accuracy - # - summarization_metrics - # - efficiency - # - general_information - # environment: - # main_split: test - # taxonomy: - # task: summarization - # what: n/a - # who: n/a - # when: "?" 
- # language: English - - # - name: call_center_summarization_pairwise_comparison - # display_name: Summarization (Pairwise) - # description: summarization - # metric_groups: - # # - accuracy - # - pairwise_comparison_metrics - # - efficiency - # - general_information - # environment: - # main_split: test - # taxonomy: - # task: summarization - # what: n/a - # who: n/a - # when: "?" - # language: English - - # - name: call_center_summarization_key_points_recall - # display_name: Summarization (Key Points Recall) - # description: summarization - # metric_groups: - # # - accuracy - # - key_points_recall_metrics - # - efficiency - # - general_information - # environment: - # main_split: test - # taxonomy: - # task: summarization - # what: n/a - # who: n/a - # when: "?" - # language: English