Skip to content

Commit a684230

Browse files
committed
add source
1 parent 30e6a32 commit a684230

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

96 files changed

+9016
-0
lines changed

entity_linkings/__init__.py

Lines changed: 154 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,154 @@
1+
import os
2+
from typing import Optional, Union
3+
4+
import datasets
5+
from datasets import Dataset, DatasetDict
6+
7+
from .dataset import DATASET_ID2CLS
8+
from .entity_dictionary import DICTIONARY_ID2CLS, EntityDictionaryBase
9+
from .models import (
10+
ED_ID2CLS,
11+
EL_ID2CLS,
12+
RETRIEVER_ID2CLS,
13+
EntityRetrieverBase,
14+
PipelineBase,
15+
)
16+
17+
18+
def get_dataset_ids() -> list[str]:
    """Return the registered dataset ids (class names in lower case)."""
    return [*DATASET_ID2CLS]
23+
24+
25+
def get_dictionary_ids() -> list[str]:
    """Return the registered entity-dictionary ids (class names in lower case)."""
    return list(DICTIONARY_ID2CLS)
30+
31+
32+
def get_retriever_ids() -> list[str]:
    """Return the registered retriever ids (class names in lower case)."""
    return [*RETRIEVER_ID2CLS]
37+
38+
39+
def get_el_ids() -> list[str]:
    """Return the registered entity-linking model ids (class names in lower case)."""
    return list(EL_ID2CLS)
44+
45+
46+
def get_ed_ids() -> list[str]:
    """Return the registered entity-disambiguation model ids (class names in lower case)."""
    return list(ED_ID2CLS)
51+
52+
53+
def get_model_ids() -> list[str]:
    """Return every registered model id: retrievers, then EL models, then ED models."""
    return [*RETRIEVER_ID2CLS, *EL_ID2CLS, *ED_ID2CLS]
58+
59+
60+
def load_dataset(
    name: str = "json",
    data_files: Optional[Union[str, dict[str, str]]] = None,
    split: Optional[str] = None,
    cache_dir: Optional[str] = None
) -> Union[DatasetDict, Dataset]:
    '''Load a registered benchmark dataset by id, or a custom JSON dataset.

    Args:
        name: Registered dataset id, optionally suffixed ``-<subset>`` to
            select a builder configuration (e.g. ``"foo-bar"`` loads subset
            ``bar`` of dataset ``foo``), or ``"json"`` for a custom dataset.
        data_files: JSON file path(s); required when ``name == "json"``.
        split: If given, return only that split instead of the full
            ``DatasetDict``.
        cache_dir: Cache directory forwarded to ``datasets``.

    Raises:
        ValueError: If ``name == "json"`` but ``data_files`` is missing, or
            if the dataset id is not registered.
    '''
    if name == "json":
        if not data_files:
            raise ValueError("Either name or data_files must be provided.")
        dataset = datasets.load_dataset("json", data_files=data_files, cache_dir=cache_dir)
    else:
        # "<id>-<subset>" selects a configuration of the dataset builder.
        # NOTE(review): only the first '-'-separated token after the id is
        # used; a name with two dashes silently drops the rest — confirm ids
        # never contain '-' themselves.
        subset = str(name.split('-')[1]) if '-' in name else None
        name = name.split('-')[0]
        if name not in get_dataset_ids():
            raise ValueError(f"The id should be one of {get_dataset_ids()}.")
        dataset_cls = DATASET_ID2CLS[name]
        # Fix: construct the builder once and reuse it. The original built a
        # second, independent builder for as_dataset(), repeating setup work.
        if subset:
            builder = dataset_cls(config_name=subset, cache_dir=cache_dir)
        else:
            builder = dataset_cls(cache_dir=cache_dir)
        builder.download_and_prepare()
        dataset = builder.as_dataset()

    if split is not None:
        return dataset[split]
    return dataset
90+
91+
92+
def load_dictionary(
    dictionary_name_or_path: str,
    nil_id: str = "-1",
    nil_name: str = "[NIL]",
    nil_description: str = "[NIL] is an entity that does not exist in this dictionary.",
    default_description: str = """{name} is an entity in this dictionary.""",
    cache_dir: Optional[str|os.PathLike] = None,
) -> EntityDictionaryBase:
    '''Load an entity dictionary from a registered id or a local JSON file.

    Args:
        dictionary_name_or_path: Either an existing file path (loaded as a
            JSON dataset, ``train`` split) or a registered dictionary id.
        nil_id: Id assigned to the NIL (out-of-dictionary) entity.
        nil_name: Display name of the NIL entity.
        nil_description: Description text of the NIL entity.
        default_description: Template used for entities without a description.
        cache_dir: Cache directory forwarded to ``datasets``.

    Raises:
        ValueError: If the id is not registered and no such file exists.
    '''
    if os.path.isfile(dictionary_name_or_path):
        # A concrete file path: read it directly as a JSON dictionary dump.
        raw_dictionary = datasets.load_dataset(
            "json", data_files=dictionary_name_or_path, cache_dir=cache_dir, split="train"
        )
    else:
        if dictionary_name_or_path not in get_dictionary_ids():
            raise ValueError(f"The id should be one of {get_dictionary_ids()}.")
        builder_cls = DICTIONARY_ID2CLS[dictionary_name_or_path]
        builder_cls(cache_dir=cache_dir).download_and_prepare()
        raw_dictionary = builder_cls(cache_dir=cache_dir).as_dataset()['dictionary']

    config = EntityDictionaryBase.Config(
        nil_id=nil_id,
        nil_name=nil_name,
        nil_description=nil_description,
        default_description=default_description,
        cache_dir=cache_dir,
    )
    return EntityDictionaryBase(dictionary=raw_dictionary, config=config)
118+
119+
120+
def get_ed_models(name: str) -> type[PipelineBase]:
    '''Look up an entity-disambiguation model class by its registry id.

    Raises:
        ValueError: If ``name`` is not a registered ED model id.
    '''
    if name not in get_ed_ids():
        # Fix: the original message listed get_el_ids() — the EL registry —
        # instead of the ED ids actually accepted here (copy-paste bug).
        raise ValueError(f"The id should be one of {get_ed_ids()}.")
    return ED_ID2CLS[name]
126+
127+
128+
def get_el_models(name: str) -> type[PipelineBase]:
    '''Look up an entity-linking model class by its registry id.

    Raises:
        ValueError: If ``name`` is not a registered EL model id.
    '''
    valid_ids = get_el_ids()
    if name not in valid_ids:
        raise ValueError(f"The id should be one of {valid_ids}.")
    return EL_ID2CLS[name]
134+
135+
136+
def get_retrievers(name: str) -> type[EntityRetrieverBase]:
    '''Look up a retriever class by its registry id.

    Raises:
        ValueError: If ``name`` is not a registered retriever id.
    '''
    if name in RETRIEVER_ID2CLS:
        return RETRIEVER_ID2CLS[name]
    raise ValueError(f"The id should be one of {get_retriever_ids()}.")
144+
145+
146+
def get_models(name: str) -> type[PipelineBase]:
    '''Look up a pipeline model class (EL or ED) by its registry id.

    EL ids take precedence if an id were ever registered in both tables.

    Raises:
        ValueError: If ``name`` is neither a registered EL nor ED model id.
    '''
    if name in get_el_ids():
        return EL_ID2CLS[name]
    if name in get_ed_ids():
        return ED_ID2CLS[name]
    # Fix: the original message also advertised get_retriever_ids(), but a
    # retriever id passed here would still raise — list only accepted ids.
    raise ValueError(f"The id should be one of {get_el_ids() + get_ed_ids()}.")

entity_linkings/cli/evaluate.py

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
import json
2+
import logging
3+
import os
4+
from argparse import ArgumentParser, Namespace
5+
6+
import torch
7+
8+
from entity_linkings import get_models, get_retrievers, load_dataset, load_dictionary
9+
from entity_linkings.utils import read_yaml
10+
11+
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# NOTE(review): `device` is never referenced in this module — presumably the
# model classes select their own device; confirm before removing.
device = torch.device('cuda') if torch.cuda.is_available() else 'cpu'
14+
15+
def evaluate(args: Namespace) -> None:
    """Evaluate a full (retriever + EL/ED model) pipeline on a test set.

    Loads the entity dictionary and the test split (registered dataset id or
    custom JSON file), optionally filters NIL entities, builds the retriever
    and model from YAML configs plus CLI overrides, runs evaluation, and
    reports metrics to the log, an optional JSON file, and optionally W&B.
    """
    dictionary = load_dictionary(args.dictionary_id_or_path, cache_dir=args.cache_dir)
    dataset_id = args.dataset_id if args.dataset_id else "json"
    if dataset_id != "json":
        test_dataset = load_dataset(dataset_id, split='test', cache_dir=args.cache_dir)
    else:
        test_dataset = load_dataset("json", data_files={"test": args.test_file}, cache_dir=args.cache_dir)['test']
    if args.remove_nil:
        from entity_linkings.data_utils import filter_nil_entities
        test_dataset = filter_nil_entities(test_dataset, dictionary)

    if args.retriever_config is not None:
        # Fix: use .get(..., {}) like the model config below — the original
        # indexed with [...] and raised KeyError when the YAML file had no
        # entry for this retriever id.
        retriever_config = read_yaml(args.retriever_config).get(args.retriever_id.lower(), {})
    else:
        retriever_config = {}
    # Fix: --retriever_model_name_or_path was parsed by cli_main but never
    # applied; honor it the same way --model_name_or_path is handled below.
    if args.retriever_model_name_or_path is not None:
        retriever_config["model_name_or_path"] = args.retriever_model_name_or_path

    if args.model_config is not None:
        model_config = read_yaml(args.model_config).get(args.model_id, {})
    else:
        model_config = {}
    if args.model_name_or_path is not None:
        model_config["model_name_or_path"] = args.model_name_or_path

    if args.wandb:
        import wandb
        wandb.init(
            project=os.environ.get("WANDB_PROJECT", "entity_linking_benchmark"),
            name=args.model_id, tags=["evaluation"]
        )
        # Record the run configuration alongside the metrics.
        wandb.log({
            "model_type": args.model_type,
            "retriever_id": args.retriever_id,
            "model_name_or_path": model_config.get("model_name_or_path", None),
            "retriever_model_name_or_path": retriever_config.get("model_name_or_path", None),
            "remove_nil": args.remove_nil,
            "dataset_id": dataset_id,
            "test_file": args.test_file,
            "dictionary_id_or_path": args.dictionary_id_or_path,
        })

    retriever_cls = get_retrievers(args.retriever_id)
    retriever = retriever_cls(dictionary, config=retriever_cls.Config(**retriever_config))
    model_cls = get_models(args.model_id)
    model = model_cls(retriever, config=model_cls.Config(**model_config))

    metrics = model.evaluate(test_dataset, num_candidates=args.num_candidates, batch_size=args.test_batch_size)
    logger.info(f"Evaluation results: {metrics}")
    if args.output_dir is not None:
        os.makedirs(args.output_dir, exist_ok=True)
        output_path = f"{args.output_dir}/eval_results.json"
        with open(output_path, "w") as f:
            json.dump(metrics, f, indent=4)
        logger.info(f"Saved evaluation results to {output_path}")
    if args.wandb:
        for key, value in metrics.items():
            wandb.log({key: value})
72+
73+
def cli_main() -> None:
    """Command-line entry point: declare evaluation options and run them."""
    parser = ArgumentParser()
    add_arg = parser.add_argument
    add_arg('--model_type', type=str, default='ed', help='Task to perform. "ed" (entity disambiguation) and "el" (entity linking) are supported.')
    add_arg('--model_id', type=str, required=True, help='Name of the model to use.')
    add_arg('--model_name_or_path', type=str, default=None, help='Name of the model to use.')
    add_arg('--retriever_id', type=str, required=True, help='Name of the retriever model to use.')
    add_arg('--retriever_model_name_or_path', type=str, default=None, help='Name of the retriever model to use.')
    add_arg('--dictionary_id_or_path', '-d', type=str, default=None, help='Path to the entity dictionary file.')
    add_arg('--dataset_id', '-D', type=str, default=None, help='Name of the dataset to use.')
    add_arg('--test_file', type=str, default=None, help='Path to the dataset file.')
    add_arg('--num_candidates', type=int, default=5, help='Number of candidate entities to consider during evaluation.')
    add_arg('--test_batch_size', type=int, default=32, help='Batch size for evaluation.')
    add_arg('--remove_nil', action='store_true', default=False, help='Whether to remove nil entities from the dataset.')
    add_arg('--output_dir', type=str, default=None, help='Path to the output directory.')
    add_arg("--cache_dir", type=str, default=None, help='Path to the cache directory.')
    add_arg('--model_config', type=str, default=None, help='YAML-based config file.')
    add_arg('--retriever_config', type=str, default=None, help='YAML-based retriever config file.')
    add_arg('--wandb', action='store_true', default=False, help='Whether to use wandb for logging.')
    evaluate(parser.parse_args())


if __name__ == "__main__":
    cli_main()
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
import json
2+
import logging
3+
import os
4+
from argparse import ArgumentParser, Namespace
5+
6+
import torch
7+
8+
from entity_linkings import get_retrievers, load_dataset, load_dictionary
9+
from entity_linkings.utils import read_yaml
10+
11+
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# NOTE(review): `device` is never referenced in this module — presumably the
# retriever classes select their own device; confirm before removing.
device = torch.device('cuda') if torch.cuda.is_available() else 'cpu'
14+
15+
def evaluate(args: Namespace) -> None:
    """Evaluate a retriever on the test split of a dataset.

    Loads the entity dictionary and test data, optionally filters NIL
    entities, builds the retriever from its YAML config plus CLI overrides,
    runs evaluation, and reports metrics to the log, an optional JSON file,
    and optionally Weights & Biases.
    """
    dictionary = load_dictionary(args.dictionary_id_or_path, cache_dir=args.cache_dir)

    dataset_id = args.dataset_id if args.dataset_id else "json"
    if dataset_id == "json":
        eval_data = load_dataset("json", data_files={"test": args.test_file}, cache_dir=args.cache_dir)['test']
    else:
        eval_data = load_dataset(dataset_id, split='test', cache_dir=args.cache_dir)
    if args.remove_nil:
        from entity_linkings.data_utils import filter_nil_entities
        eval_data = filter_nil_entities(eval_data, dictionary)

    if args.wandb:
        import wandb
        wandb.init(
            project=os.environ.get("WANDB_PROJECT", "entity_linkings"),
            name=args.retriever_id, tags=["evaluation"]
        )
        # Record the run configuration alongside the metrics.
        wandb.log({
            "retriever_id": args.retriever_id,
            "dataset_id": dataset_id,
            "dictionary_id_or_path": args.dictionary_id_or_path,
            "model_name_or_path": args.retriever_model_name_or_path,
            "remove_nil": args.remove_nil
        })

    cfg: dict = {}
    if args.retriever_config is not None:
        cfg = read_yaml(args.retriever_config).get(args.retriever_id, {})
    if args.retriever_model_name_or_path is not None:
        cfg["model_name_or_path"] = args.retriever_model_name_or_path

    retriever_cls = get_retrievers(args.retriever_id)
    retriever = retriever_cls(dictionary=dictionary, config=retriever_cls.Config(**cfg))
    metrics = retriever.evaluate(eval_data, batch_size=args.test_batch_size)
    logger.info(f"Evaluation results: {metrics}")

    if args.output_dir is not None:
        os.makedirs(args.output_dir, exist_ok=True)
        output_path = f"{args.output_dir}/eval_results.json"
        with open(output_path, "w") as fp:
            json.dump(metrics, fp, indent=4)
        logger.info(f"Saved evaluation results to {output_path}")
    if args.wandb:
        for metric_name, metric_value in metrics.items():
            wandb.log({metric_name: metric_value})
60+
61+
62+
def cli_main() -> None:
    """Command-line entry point for retriever evaluation."""
    parser = ArgumentParser()
    add_arg = parser.add_argument
    add_arg('--retriever_id', type=str, required=True, help='Name of the retriever model to use.')
    add_arg('--retriever_model_name_or_path', type=str, default=None, help='Name of the model to use.')
    add_arg('--dictionary_id_or_path', type=str, default=None, help='Path to the entity dictionary file.')
    add_arg('--dataset_id', type=str, default=None, help='Name of the dataset to use.')
    add_arg('--test_file', type=str, default=None, help='Path to the dataset file.')
    add_arg('--test_batch_size', type=int, default=32, help='Batch size for evaluation.')
    add_arg('--remove_nil', action='store_true', default=False, help='Whether to remove nil entities from the dataset.')
    add_arg('--output_dir', type=str, default=None, help='Path to the output directory.')
    add_arg("--cache_dir", type=str, default=None, help='Path to the cache directory.')
    add_arg('--retriever_config', type=str, default=None, help='YAML-based config file.')
    add_arg('--wandb', action='store_true', default=False, help='Whether to use wandb for logging.')
    evaluate(parser.parse_args())


if __name__ == "__main__":
    cli_main()

0 commit comments

Comments
 (0)