Commit 35c9232

add split creation script and fix typos
1 parent 793d703 commit 35c9232

12 files changed: +219 additions, -43 deletions

README.md

Lines changed: 5 additions & 5 deletions
@@ -1,20 +1,20 @@
-# Label Shift Estimation for Named Entity Recognition using Familarity
+# Label Shift Estimation for Named Entity Recognition using Familiarity
 
 **Our paper got accepted to NAACL 2025 🎉 See our [paper](https://arxiv.org/abs/2412.10121) and find the datasets on the [huggingface hub]()!**
 
-This repository computes the label shift for zero-shot NER settings using the Familarity metric. The metric uses semantic similarity between the sets of label seen during training and used for evaluation to indicate how "familiar" the trained model will be with the evaluation labels.
+This repository computes the label shift for zero-shot NER settings using the Familiarity metric. The metric uses semantic similarity between the sets of labels seen during training and used for evaluation to indicate how "familiar" the trained model will be with the evaluation labels.
 
 ## Installation
 ```python
-conda create -n familarity python=3.11
-conda activate familarity
+conda create -n familiarity python=3.11
+conda activate familiarity
 pip install -e .
 ```
 
 ## Usage
 ```python
 import numpy as np
-from familarity import compute_metric
+from familiarity import compute_metric
 train_labels_set = ["person", "location", "building", "eagle", "restaurant", "util"]
 train_probs = [0.4, 0.1, 0.1, 0.1, 0.1, 0.2]
 train_labels = np.random.choice(train_labels_set, size=30000, p=train_probs).tolist()
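
The README hunk stops at the training-label setup. A minimal sketch of how the usage example might continue is shown below; the evaluation-label setup and the positional `compute_metric(train_labels, eval_labels)` call are assumptions for illustration, not taken from this commit.

```python
import numpy as np
from familiarity import compute_metric

train_labels_set = ["person", "location", "building", "eagle", "restaurant", "util"]
train_probs = [0.4, 0.1, 0.1, 0.1, 0.1, 0.2]
train_labels = np.random.choice(train_labels_set, size=30000, p=train_probs).tolist()

# Hypothetical evaluation labels drawn the same way (names and sizes are made up).
eval_labels_set = ["organization", "person", "city", "dish"]
eval_labels = np.random.choice(eval_labels_set, size=5000).tolist()

# Assumed call: the package exposes compute_metric, but the exact argument
# names are not visible in this diff.
compute_metric(train_labels, eval_labels)
```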
Lines changed: 180 additions & 0 deletions
@@ -0,0 +1,180 @@
import copy
import json
from typing import Dict, List

import numpy as np
import pandas as pd
import torch
from datasets import Dataset, DatasetDict
from sentence_transformers import SentenceTransformer
from torch.nn.functional import cosine_similarity
from tqdm import tqdm


def create_splits_for_hf_hub(train_dataset: str):
    # Dataset format should be a list of dictionaries, where each dictionary represents a data point.
    path_to_train_data = f"path/to/train/{train_dataset}.json"
    with open(path_to_train_data, "r") as f:
        data = json.load(f)

    for filter_by in ["entropy", "max"]:
        dataset_dict = DatasetDict()
        for setting in ["easy", "medium", "hard"]:
            new_split = create_splits(
                data,
                train_dataset,
                filter_by=filter_by,
                setting=setting,
            )

            hf_format = [convert_to_hf_format(data_point) for data_point in new_split]

            ds = Dataset.from_pandas(pd.DataFrame(data=hf_format))
            dataset_dict[setting] = ds

        dataset_dict.push_to_hub(f"{train_dataset}_{filter_by}_splits")


def convert_to_hf_format(data_point):
    tags = ["O"] * len(data_point["tokenized_text"])
    spans = []
    for ent in data_point["ner"]:
        start, end, label = ent[0], ent[1], ent[2]
        spans.append({"start": start, "end": end, "label": label})
        if start == end:
            tags[start] = "B-" + label
        else:
            try:
                tags[start] = "B-" + label
                tags[start + 1 : end + 1] = ["I-" + label] * (end - start)
            except IndexError:  # skip spans that fall outside the tokenized text
                pass
    return {"tokens": data_point["tokenized_text"], "ner_tags": tags, "spans": spans}


def create_splits(
    dataset: List[Dict],
    dataset_name: str,  # The name of the dataset for which the splits should be created
    filter_by: str = "entropy",
    setting: str = "medium",
):
    try:
        df = pd.read_pickle("new_splits.pkl")
    except FileNotFoundError:
        raise FileNotFoundError("Please run the compute_new_splits function first to generate the data.")
    df = df[(df["train_dataset"] == dataset_name)]

    selected_entity_types = []
    for benchmark_name in df["eval_dataset"].unique():
        _df = df[(df["eval_dataset"] == benchmark_name)].copy()

        # The thresholds are dataset-specific and may need to be adjusted for datasets with different characteristics.
        if filter_by == "entropy":
            low_threshold = df[filter_by].quantile(0.01)
            high_threshold = df[filter_by].quantile(0.95)
        elif filter_by == "max":
            low_threshold = df[filter_by].quantile(0.05)
            high_threshold = df[filter_by].quantile(0.99)

        medium_lower_threshold = df[filter_by].quantile(0.495)
        medium_upper_threshold = df[filter_by].quantile(0.505)

        # Define conditions and choices for categorization
        conditions = [
            _df[filter_by] <= low_threshold,  # Bottom
            _df[filter_by].between(medium_lower_threshold, medium_upper_threshold),  # Middle
            _df[filter_by] >= high_threshold,  # Top
        ]
        choices = ["easy", "medium", "hard"] if filter_by == "entropy" else ["hard", "medium", "easy"]

        # Use np.select to create the new column based on the conditions
        _df["difficulty"] = np.select(conditions, choices, default="not relevant")

        selected_entity_types.extend(_df[_df["difficulty"] == setting]["entity"].tolist())

    new_dataset = []
    for dp in tqdm(dataset):
        matched_entities = [x for x in dp["ner"] if x[-1].lower().strip() in selected_entity_types]
        if matched_entities:
            new_dp = copy.deepcopy(dp)
            new_dp["ner"] = matched_entities
            new_dataset.append(new_dp)

    return new_dataset


def compute_new_splits():
    # TODO: you need to load the data into two variables: 'benchmarks' and 'training_datasets'.
    # 'benchmarks' should be a dictionary with the benchmark names as keys and the (list of distinct) entity types as values.
    # 'training_datasets' should be a dictionary with the training dataset names as keys and the (list of distinct) entity types as values.
    # We process multiple benchmarks and training datasets in this example, but you can adjust the code to fit your needs.
    # Further, we stick with the following dataset layout: a list of dictionaries, where each dictionary represents a data point.
    # For example: [{'tokenized_text': [...], 'ner': [(start, end, entity_type), ...]}, ...]

    benchmarks = {}
    for benchmark_name in ['path/to/eval/dataset1.json', 'path/to/eval/dataset2.json']:
        # Data loading logic here, e.g.:
        # tokens, entity_types = load_eval_dataset(benchmark_name)
        # benchmarks[benchmark_name] = list(entity_types)
        pass

    training_datasets = {}
    for train_dataset_name in ['path/to/train/dataset1.json', 'path/to/train/dataset2.json']:
        # Data loading logic here, e.g.:
        # tokens, entity_types = load_train_dataset(train_dataset_name)
        # training_datasets[train_dataset_name] = list(entity_types)
        pass

    batch_size = 256
    model = SentenceTransformer("all-mpnet-base-v2").to("cuda")
    eval_encodings = {}
    for benchmark_name, entity_types in benchmarks.items():
        embeddings = model.encode(entity_types, convert_to_tensor=True, device="cuda")
        eval_encodings[benchmark_name] = embeddings

    results = {}
    for dataset_name, entity_types in training_datasets.items():
        for i in tqdm(range(0, len(entity_types), batch_size)):
            dataset_name = dataset_name.split(".")[0]
            batch = entity_types[i : i + batch_size]
            embeddings = model.encode(batch, convert_to_tensor=True, device="cuda")
            for benchmark_name, eval_embeddings in eval_encodings.items():
                similarities = torch.clamp(
                    cosine_similarity(
                        embeddings.unsqueeze(1),
                        eval_embeddings.unsqueeze(0),
                        dim=2,
                    ),
                    min=0.0,
                    max=1.0,
                )
                probabilities = torch.nn.functional.softmax(similarities / 0.01, dim=1)  # sharp softmax over eval labels
                entropy_values = -torch.sum(probabilities * torch.log(probabilities + 1e-10), dim=1)
                max_values, _ = torch.max(similarities, dim=1)

                if dataset_name not in results:
                    results[dataset_name] = {}
                if benchmark_name not in results[dataset_name]:
                    results[dataset_name][benchmark_name] = {}

                for j, entity in enumerate(batch):
                    if entity not in results[dataset_name][benchmark_name]:
                        results[dataset_name][benchmark_name][entity] = {}
                    results[dataset_name][benchmark_name][entity]["entropy"] = entropy_values[j].cpu().numpy().item()
                    results[dataset_name][benchmark_name][entity]["max"] = max_values[j].cpu().numpy().item()

    entries = []
    for dataset_name, eval_comparisons in results.items():
        for benchmark_name, mapping in eval_comparisons.items():
            for entity, values in mapping.items():
                entries.append(
                    {
                        "entity": entity,
                        "entropy": values["entropy"],
                        "max": values["max"],
                        "eval_dataset": benchmark_name,
                        "train_dataset": dataset_name,
                    }
                )
    df = pd.DataFrame.from_dict(entries, orient="columns")
    df.to_pickle("new_splits.pkl")
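
The new script exposes two entry points meant to run in sequence: `compute_new_splits` writes `new_splits.pkl`, which `create_splits_for_hf_hub` then reads via `create_splits`. A minimal sketch of how they might be chained is below; the `__main__` guard and the dataset name `"dataset1"` are illustrative assumptions, not part of the commit.

```python
if __name__ == "__main__":
    compute_new_splits()                  # embeds entity types and stores per-entity entropy/max scores
    create_splits_for_hf_hub("dataset1")  # builds easy/medium/hard splits and pushes them to the hub
```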

pyproject.toml

Lines changed: 3 additions & 3 deletions
@@ -1,7 +1,7 @@
 [project]
-name = "familarity"
+name = "familiarity"
 dynamic = ["version"]
-description = "Estimating label shift and transfer difficulty using Familarity."
+description = "Estimating label shift and transfer difficulty using Familiarity."
 authors = [{ name = "Jonas Golde", email = "jonas.max.golde@hu-berlin.de" }]
 readme = "README.md"
 requires-python = ">3.8"
@@ -33,7 +33,7 @@ testing = ["pytest"]
 dev = ["black", "isort", "ruff"]
 
 [tool.setuptools]
-packages = ["familarity"]
+packages = ["familiarity"]
 package-dir = { "" = "src" }
 
 [tool.black]

src/familarity/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -1,3 +1,3 @@
-from familarity.metric import compute_metric
+from familiarity.metric import compute_metric
 
 __all__ = ["compute_metric"]

src/familarity/embedding_models.py

Lines changed: 1 addition & 2 deletions
@@ -6,13 +6,12 @@
 
 import numpy as np
 import torch
+from familiarity.utils import get_device
 from huggingface_hub import repo_exists
 from sentence_transformers import SentenceTransformer
 from tqdm import tqdm
 from transformers import AutoModel, AutoTokenizer
 
-from familarity.utils import get_device
-
 
 class LabelEmbeddingModel(ABC):
     def __init__(self):

src/familarity/metric.py

Lines changed: 13 additions & 14 deletions
@@ -5,18 +5,17 @@
 
 import numpy as np
 import pandas as pd
-from tqdm import tqdm
-
-from familarity.embedding_models import LabelEmbeddingModel, load_embedding_model
-from familarity.logger import setup_logger
-from familarity.utils import (
+from familiarity.embedding_models import LabelEmbeddingModel, load_embedding_model
+from familiarity.logger import setup_logger
+from familiarity.utils import (
     clipped_cosine_similarity,
     combine_counters,
     cumsum_until,
     df_to_prettytable,
     iterate_dict_in_batches,
     make_output_path,
 )
+from tqdm import tqdm
 
 
 def compute_embeddings(
@@ -82,29 +81,29 @@ def compute_similarities(
     return similarity_df
 
 
-def compute_familarity(
+def compute_familiarity(
     similarity_df: pd.DataFrame,
     k: int = 1000,
     weighting: str = "zipf",
     output_path: Path = None,
     save_embeddings: bool = False,
 ) -> pd.DataFrame:
-    familarity_data = []
+    familiarity_data = []
 
     for label_test in similarity_df["label_test"].unique():
         test_label_df = similarity_df[similarity_df["label_test"] == label_test]
         test_label_df = test_label_df.sort_values("similarity", ascending=False)
         counts = cumsum_until(test_label_df["count_train"], k)
         sims = test_label_df["similarity"][: len(counts)]
-        familarity = weighted_average(sims, counts, k, weighting=weighting)
-        familarity_data.append({"label": label_test, "familarity": familarity})
+        familiarity = weighted_average(sims, counts, k, weighting=weighting)
+        familiarity_data.append({"label": label_test, "familiarity": familiarity})
 
-    familarity_df = pd.DataFrame(familarity_data)
+    familiarity_df = pd.DataFrame(familiarity_data)
 
     if save_embeddings:
-        familarity_df.to_pickle(output_path / "familarity_df.pkl")
+        familiarity_df.to_pickle(output_path / "familiarity_df.pkl")
 
-    return familarity_df
+    return familiarity_df
 
 
 def weighted_average(
@@ -165,8 +164,8 @@ def compute_metric(
     )
 
     similarity_df = compute_similarities(embedding_df, output_path=output_path, save_embeddings=save_embeddings)
-    familarity_df = compute_familarity(
+    familiarity_df = compute_familiarity(
         similarity_df, k=k, weighting=weighting, output_path=output_path, save_embeddings=save_embeddings
     )
     logger.info("Results:\n")
-    logger.info(df_to_prettytable(familarity_df))
+    logger.info(df_to_prettytable(familiarity_df))
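
The renamed `compute_familiarity` above sorts the similarities for each test label, truncates the train counts via `cumsum_until` (presumably once they sum to `k`), and takes a weighted average of the remaining similarities. A toy, self-contained sketch of that flow is below; the inline truncation and the plain count-weighted mean are simplified stand-ins for `cumsum_until` and the Zipf-weighted `weighted_average` from `familiarity.utils`, and the similarity values are made up.

```python
import numpy as np
import pandas as pd

# Toy stand-in for the similarity_df consumed by compute_familiarity: one row per
# (train label, test label) pair with its similarity and train-label count.
similarity_df = pd.DataFrame(
    {
        "label_test": ["actor"] * 3,
        "label_train": ["person", "director", "building"],
        "similarity": [0.82, 0.74, 0.31],
        "count_train": [900, 300, 500],
    }
)

k = 1000
df = similarity_df.sort_values("similarity", ascending=False)

# Simplified truncation: keep train counts until they sum to k.
counts = []
for c in df["count_train"]:
    counts.append(min(c, k - sum(counts)))
    if sum(counts) >= k:
        break
sims = df["similarity"][: len(counts)]

# Plain count-weighted mean as a stand-in for the library's zipf weighting.
print(np.average(sims, weights=counts))  # ~0.81 for this toy data
```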

src/familarity/utils.py

Lines changed: 1 addition & 1 deletion
@@ -36,7 +36,7 @@ def df_to_prettytable(df: pd.DataFrame) -> PrettyTable:
     for idx, row in df.iterrows():
         table.add_row(row, divider=True if idx + 1 == len(df) else False)
 
-    table.add_row(["Marco-Avg. Familarity", round(df["familarity"].mean().item(), 3)])
+    table.add_row(["Macro-Avg. Familiarity", round(df["familiarity"].mean().item(), 3)])
 
     return table
 

tests/conftest.py

Lines changed: 1 addition & 2 deletions
@@ -1,7 +1,6 @@
 import numpy as np
 import pytest
-
-from familarity.embedding_models import LabelEmbeddingModel
+from familiarity.embedding_models import LabelEmbeddingModel
 
 
 @pytest.fixture(scope="module")

tests/test_embedding_models.py

Lines changed: 1 addition & 2 deletions
@@ -1,7 +1,6 @@
 import numpy as np
 import pytest
-
-from familarity.embedding_models import (
+from familiarity.embedding_models import (
     FastTextModel,
     GloveModel,
     SentenceTransformerModel,

tests/test_logger.py

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 from pathlib import Path
 
-from familarity.logger import setup_logger
+from familiarity.logger import setup_logger
 
 
 def test_setup_logger(tmp_path: Path, capsys, caplog):
