task_dimensionality_reduction/src/metrics/clustering_performance/config.vsh.yaml at aef1d1d22f0f15a9e1dc22c17fece27fac28048b · openproblems-bio/task_dimensionality_reduction · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
# Base component API configuration (merged in from the metric API file)
__merge__: '../../api/comp_metric.yaml'

# Component configuration: the component name, followed by its metadata
name: 'clustering_performance'
info:
  metrics:
    # NMI between a clustering of the embedding and the cell labels.
    # The mean used in the denominator is chosen with the `--nmi_avg_method`
    # argument declared under `arguments` in this file.
    - name: normalized_mutual_information
      label: NMI
      summary: Normalized Mutual Information (NMI) is a measure of the concordance between clustering obtained from the reduced-dimensional embeddings and the cell labels.
      description: |
        The Normalized Mutual Information (NMI) is a measure of the similarity between cluster labels obtained from the clustering of dimensionality reduction embeddings and the true cell labels. It is a normalization of the Mutual Information (MI) score to scale the results between 0 (no mutual information) and 1 (perfect correlation).
        Mutual Information quantifies the "amount of information" obtained about one random variable by observing the other random variable. Assuming two label assignments X and Y, it is given by:
          $MI(X,Y) = \sum_{x=1}^{|X|}\sum_{y=1}^{|Y|}P(x,y)\log\left(\frac{P(x,y)}{P(x)P'(y)}\right)$,
        where P(x,y) is the joint probability mass function of X and Y, and P(x), P'(y) are the marginal probability mass functions of X and Y respectively. The mutual information is normalized by some generalized mean of H(X) and H(Y). Therefore, Normalized Mutual Information can be defined as:
          $NMI(X,Y) = \frac{MI(X,Y)}{mean(H(X),H(Y))}$,
        where H(X) and H(Y) are the entropies of X and Y respectively. A higher NMI score suggests that the method is effective in preserving relevant information.
      references:
        doi: 10.1371/journal.pone.0159161
      links:
        documentation: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.normalized_mutual_info_score.html
        # Source repository of the implementation (previously a copy of the
        # documentation URL).
        repository: https://github.com/scikit-learn/scikit-learn
      min: 0
      max: 1
      maximize: true
    # ARI between a clustering of the embedding and the cell type labels.
    - name: adjusted_rand_index
      label: ARI
      summary: Adjusted Rand Index (ARI) is a measure of the similarities between two cluster assignments of the reduced-dimensional embeddings and the true cell types.
      description: |
        Adjusted Rand Index (ARI) is a measure of similarity between two clusterings by considering all pairs of samples and counting pairs that are assigned in the same or different clusters in the predicted (from the reduced dimensional embeddings) and true clusterings (cell type labels). It is the Rand Index (RI) adjusted for chance.
        Assuming the C as the cell type labels and K as the clustering of the reduced dimensional embedding, Rand Index can be defined as:
          $RI = \frac{a + b}{{C}_{2}^{n_{samples}}}$,
        where 'a' is the number of pairs of elements that are in the same set in C and in the same set in K, 'b' is the number of pairs of elements that are in different sets in C and in different sets in K, and ${C}_{2}^{n_{samples}}$ is the total number of possible pairs in the dataset. Random label assignments can be discounted as follows:
          $ARI = \frac{RI - E[RI]}{max(RI) - E[RI]}$,
        where E[RI] is the expected RI of random labellings.
      references:
        bibtex: |
          @InProceedings{santos2009on,
            author = {Santos, Jorge M. and Embrechts, Mark},
            editor = {Alippi, Cesare and Polycarpou, Marios and Panayiotou, Christos and Ellinas, Georgios},
            title = {On the Use of the Adjusted Rand Index as a Metric for Evaluating Supervised Classification},
            booktitle = {Artificial Neural Networks -- ICANN 2009},
            year = {2009},
            publisher = {Springer Berlin Heidelberg},
            address = {Berlin, Heidelberg},
            pages = {175--184},
            isbn = {978-3-642-04277-5},
            doi = {10.1007/978-3-642-04277-5_18},
            url = {https://doi.org/10.1007/978-3-642-04277-5_18}
          }
      links:
        documentation: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.adjusted_rand_score.html#sklearn.metrics.adjusted_rand_score
        # Source repository of the implementation (previously a copy of the
        # documentation URL).
        repository: https://github.com/scikit-learn/scikit-learn
      # ARI is adjusted for chance, so clusterings that agree less than a
      # random labelling score below zero; scikit-learn documents the range
      # of adjusted_rand_score as -0.5 to 1.0.
      min: -0.5
      max: 1
      maximize: true

# Script configuration
arguments:
  # Selects how the NMI normalizer (the denominator of the score) is computed.
  - name: '--nmi_avg_method'
    type: string
    description: Method to compute normalizer in the denominator for normalized mutual information score calculation.
    choices:
      - min
      - geometric
      - arithmetic
      - max
    default: arithmetic
resources:
  # Python script listed as the component's resource.
  - path: script.py
    type: python_script

# Platform configuration
engines:
  # Docker engine: base image plus extra Python packages added during setup.
  - type: docker
    image: 'openproblems/base_python:1'
    setup:
      - type: python
        packages:
          - scikit-learn
          - scanpy
          - leidenalg
runners:
  - type: executable
  # Nextflow runner; the labels presumably select time/memory/CPU tiers
  # defined outside this file (TODO confirm).
  - type: nextflow
    directives:
      label:
        - midtime
        - midmem
        - lowcpu