@@ -1372,6 +1372,14 @@ def unlabeled_identifier(self) -> str:
13721372
13731373
13741374class Corpus (typing .Generic [T_co ]):
1375+ """The main object in Flair for holding a dataset used for training and testing.
1376+
1377+ A corpus consists of three splits: A `train` split used for training, a `dev` split used for model selection
1378+ and/or early stopping and a `test` split used for testing. All three splits are optional, so it is possible
1379+ to create a corpus only using one or two splits. If the option `sample_missing_splits` is set to True,
1380+ missing splits will be randomly sampled from the training split.
1381+ """
1382+
13751383 def __init__ (
13761384 self ,
13771385 train : Optional [Dataset [T_co ]] = None ,
@@ -1381,6 +1389,26 @@ def __init__(
13811389 sample_missing_splits : Union [bool , str ] = True ,
13821390 random_seed : Optional [int ] = None ,
13831391 ) -> None :
1392+ """
1393+ Constructor method to initialize a :class:`Corpus`. You can define the train, dev and test split
1394+ by passing the corresponding Dataset object to the constructor. At least one split should be defined.
1395+ If the option `sample_missing_splits` is set to True, missing splits will be randomly sampled from the
1396+ train split.
1397+
1398+ In most cases, you will not use the constructor yourself. Rather, you will create a corpus using one of our
1399+ helper methods that read common NLP filetypes. For instance, you can use
1400+ :class:`flair.datasets.sequence_labeling.ColumnCorpus` to read CoNLL-formatted files directly into
1401+ a :class:`Corpus`.
1402+
1403+ Args:
1404+ train: The split you use for model training.
1405+ dev: A holdout split typically used for model selection or early stopping.
1406+ test: The final test data to compute the score of the model.
1407+ name: A name that identifies the corpus.
1408+ sample_missing_splits: If set to True, missing splits are sampled from train. If set to False,
1409+ missing splits are not sampled and left empty. Default: True.
1410+ random_seed: Set a random seed to control the sampling of missing splits.
1411+ """
13841412 # set name
13851413 self .name : str = name
13861414
@@ -1419,14 +1447,17 @@ def __init__(
14191447
14201448 @property
14211449 def train (self ) -> Optional [Dataset [T_co ]]:
1450+ """The training split as a :class:`torch.utils.data.Dataset` object."""
14221451 return self ._train
14231452
14241453 @property
14251454 def dev (self ) -> Optional [Dataset [T_co ]]:
1455+ """The dev split as a :class:`torch.utils.data.Dataset` object."""
14261456 return self ._dev
14271457
14281458 @property
14291459 def test (self ) -> Optional [Dataset [T_co ]]:
1460+ """The test split as a :class:`torch.utils.data.Dataset` object."""
14301461 return self ._test
14311462
14321463 def downsample (
@@ -1443,12 +1474,12 @@ def downsample(
14431474 data points. It additionally returns a pointer to itself for use in method chaining.
14441475
14451476 Args:
1446- percentage (float) : A float value between 0. and 1. that indicates to which percentage the corpus
1477+ percentage: A float value between 0. and 1. that indicates to which percentage the corpus
14471478 should be downsampled. Default value is 0.1, meaning it gets downsampled to 10%.
1448- downsample_train (bool) : Whether or not to include the training split in downsampling. Default is True.
1449- downsample_dev (bool) : Whether or not to include the dev split in downsampling. Default is True.
1450- downsample_test (bool) : Whether or not to include the test split in downsampling. Default is True.
1451- random_seed (int) : An optional random seed to make downsampling reproducible.
1479+ downsample_train: Whether or not to include the training split in downsampling. Default is True.
1480+ downsample_dev: Whether or not to include the dev split in downsampling. Default is True.
1481+ downsample_test: Whether or not to include the test split in downsampling. Default is True.
1482+ random_seed: An optional random seed to make downsampling reproducible.
14521483
14531484 Returns:
14541485 A pointer to itself for optional use in method chaining.
@@ -1580,9 +1611,17 @@ def _downsample_to_proportion(dataset: Dataset, proportion: float, random_seed:
15801611 return splits [0 ]
15811612
15821613 def obtain_statistics (self , label_type : Optional [str ] = None , pretty_print : bool = True ) -> Union [dict , str ]:
1583- """Print statistics about the class distribution and sentence sizes .
1614+ """Print statistics about the corpus, including the length of the sentences and the labels in the corpus .
15841615
1585- only labels of sentences are taken into account
1616+ Args:
1617+ label_type: Optionally set this value to obtain statistics only for one specific type of label (such
1618+ as "ner" or "pos"). If not set, statistics for all labels will be returned.
1619+ pretty_print: If set to True, returns pretty json (indented for readability). If not, the json is
1620+ returned as a single line. Default: True.
1621+
1622+ Returns:
1623+ If pretty_print is True, returns a pretty print formatted string in json format. Otherwise, returns a
1624+ dictionary holding a json.
15861625 """
15871626 json_data = {
15881627 "TRAIN" : self ._obtain_statistics_for (self .train , "TRAIN" , label_type ),
@@ -1654,7 +1693,21 @@ def make_label_dictionary(
16541693 ) -> Dictionary :
16551694 """Creates a dictionary of all labels assigned to the sentences in the corpus.
16561695
1657- :return: dictionary of labels
1696+ Args:
1697+ label_type: The name of the label type for which the dictionary should be created. Some corpora have
1698+ multiple layers of annotation, such as "pos" and "ner". In this case, you should choose the label type
1699+ you are interested in.
1700+ min_count: Optionally set this to exclude rare labels from the dictionary (i.e., labels seen fewer
1701+ than the provided integer value).
1702+ add_unk: Optionally set this to True to include a "UNK" value in the dictionary. In most cases, this
1703+ is not needed since the label dictionary is well-defined, but some use cases might have open classes
1704+ and require this.
1705+ add_dev_test: Optionally set this to True to construct the label dictionary not only from the train
1706+ split, but also from dev and test. This is only necessary if some labels never appear in train but do
1707+ appear in one of the other splits.
1708+
1709+ Returns:
1710+ A Dictionary of all unique labels in the corpus.
16581711 """
16591712 if min_count > 0 and not add_unk :
16601713 add_unk = True
@@ -1833,13 +1886,25 @@ def add_label_noise(
18331886 )
18341887
18351888 def get_label_distribution (self ):
1889+ """Counts occurrences of each label in the corpus and returns them as a dictionary object.
1890+
1891+ This allows you to get an idea of which label appears how often in the Corpus.
1892+
1893+ Returns:
1894+ Dictionary with labels as keys and their occurrences as values.
1895+ """
18361896 class_to_count = defaultdict (lambda : 0 )
18371897 for sent in self .train :
18381898 for label in sent .labels :
18391899 class_to_count [label .value ] += 1
18401900 return class_to_count
18411901
18421902 def get_all_sentences (self ) -> ConcatDataset :
1903+ """Returns all sentences (spanning all three splits) in the :class:`Corpus`.
1904+
1905+ Returns:
1906+ A :class:`torch.utils.data.Dataset` object that includes all sentences of this corpus.
1907+ """
18431908 parts = []
18441909 if self .train :
18451910 parts .append (self .train )
0 commit comments