diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index f146d20..5e757c6 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -20,6 +20,7 @@ jobs: shell: bash -el {0} run: | conda install pytest pytest-cov coveralls ${{ matrix.conda-deps }} + pip install upset-alttxt==0.5.2 python setup.py install cp ci/matplotlibrc matplotlibrc - name: test diff --git a/README.rst b/README.rst index e56155e..2902314 100644 --- a/README.rst +++ b/README.rst @@ -130,6 +130,7 @@ Installation requires: * pandas * matplotlib >= 2.0 * seaborn to use `UpSet.add_catplot` +* upset-alttxt v0.5.2 to use `UpSet.get_alt_text` It should then be possible to:: diff --git a/doc/requirements.txt b/doc/requirements.txt index aa52f7d..ae049f7 100644 --- a/doc/requirements.txt +++ b/doc/requirements.txt @@ -10,3 +10,4 @@ scikit-learn nbsphinx sphinx<2 sphinx-rtd-theme +upset-alttxt diff --git a/examples/plot_alt_text.py b/examples/plot_alt_text.py new file mode 100644 index 0000000..e2dd6fa --- /dev/null +++ b/examples/plot_alt_text.py @@ -0,0 +1,72 @@ +""" +========================================== +Data Vis: Alt text generation in UpSetPlot +========================================== + +Explore text description generation via upset-alttxt (2025). + +When text description generation is enabled, there are no changes to the actual plot. +The generated text description can be accessed after creating the UpSet plot object. 
+ +""" + +from matplotlib import pyplot as plt + +from upsetplot import UpSet, generate_counts + +# Load the dataset into a DataFrame +example = generate_counts() + +########################################################################## + +print("Generating a plot AND grammar for textual description") +upset = UpSet( + example, + subset_size="count", + sort_by="-cardinality", + sort_categories_by="-cardinality", + orientation="vertical", + gen_grammar=True, +) +upset.plot() +plt.suptitle("UpSet plot with text description generated'") +text_description = upset.get_alt_text() + +print("==================================") +print("Long Description (markdown formatted)") +print("==================================") +print(text_description["longDescription"]) + +print("==================================") +print("Short Description") +print("==================================") +print(text_description["shortDescription"]) + +print("\n==================================") +print("Technique Description") +print("==================================") +print(text_description["techniqueDescription"]) + +plt.show() + + +print("\nNow to generate the same plot with no alt text generation") + +# To disable grammar generation, simply ignore the gen_grammar parameter or set it to False. 
def generate_grammar(
    df,
    intersections,
    totals,
    *,
    horizontal,
    sort_by,
    sort_categories_by,
    min_degree=None,
    max_degree=None,
    include_empty_subsets=False,
    include_data=False,
    meta_data=None,
):
    """
    Generate the grammar used by UpSet 2 and Multinet to generate alt text.

    Some values present in UpSet 2 have no UpSetPlot counterpart
    (e.g. aggregation), so fixed default values are emitted for them.

    .. versionadded:: 0.10

    Parameters
    ----------
    df : :class:`pandas.core.frame.DataFrame`
        The UpSet data DataFrame. Only used when ``include_data`` is True.
    intersections : :class:`pandas.core.series.Series`
        The list of intersections. Only used when ``include_data`` is True.
    totals : :class:`pandas.core.series.Series`
        The list of totals; its index supplies the visible set names.
    horizontal : bool
        Plot orientation.
    sort_by : str
        The attribute to sort the subsets by. ``"input"``, ``"-input"`` and
        any unrecognised value (including None) fall back to
        ``Size``/``Descending``.
    sort_categories_by : str
        The attribute to sort the categories by. Anything other than
        ``"cardinality"`` / ``"-cardinality"`` maps to ``"Alphabetical"``.
    min_degree : number, optional
        The minimum degree. Defaults to 0 in the grammar.
    max_degree : number, optional
        The maximum degree. Defaults to 6 in the grammar.
    include_empty_subsets : bool, default=False
        Whether to include empty subsets.
    include_data : bool, default=False
        Whether to include subset data (``processedData``, ``rawData`` and
        ``accessibleProcessedData``).
    meta_data : dict, optional
        Plot information. The keys ``"title"``, ``"caption"``,
        ``"description"``, ``"sets"`` and ``"items"`` are copied into
        ``grammar["plotInformation"]``; missing keys become ``""``.

    Returns
    -------
    dict
        The generated grammar.
    """
    # UpSetPlot ``sort_by`` -> UpSet 2 (sortBy, sortByOrder).
    # NOTE(review): the directions are carried over unchanged from the
    # original mapping; confirm they match the order UpSetPlot really draws.
    subset_sorts = {
        "degree": ("Degree", "Descending"),
        "-degree": ("Degree", "Ascending"),
        "cardinality": ("Size", "Ascending"),
        "-cardinality": ("Size", "Descending"),
        # input order is not supported by UpSet 2
        "input": ("Size", "Descending"),
        "-input": ("Size", "Descending"),
    }
    # Use the same fallback for unknown values so that "sortByOrder" is
    # always present in the grammar instead of silently missing.
    sort_key, sort_order = subset_sorts.get(sort_by, ("Size", "Descending"))

    # UpSetPlot ``sort_categories_by`` -> UpSet 2 sortVisibleBy.
    # Input order is not supported by UpSet 2, hence "Alphabetical".
    category_sorts = {
        "cardinality": "Descending",
        "-cardinality": "Ascending",
    }
    visible_sort = category_sorts.get(sort_categories_by, "Alphabetical")

    plot_info = dict.fromkeys(
        ("title", "caption", "description", "sets", "items"), ""
    )
    if meta_data is not None:
        plot_info = {field: meta_data.get(field, "") for field in plot_info}

    # default grammar state values required by UpSet 2/Multinet
    grammar = {
        "version": "0.1.0",  # alt-text grammar version
        "plotInformation": plot_info,
        "horizontal": horizontal,
        "firstAggregateBy": "None",
        "firstOverlapDegree": 2,
        "secondAggregateBy": "None",
        "secondOverlapDegree": 2,
        "sortVisibleBy": visible_sort,
        "sortBy": sort_key,
        "sortByOrder": sort_order,
        "filters": {
            "maxVisible": 6 if max_degree is None else max_degree,
            "minVisible": 0 if min_degree is None else min_degree,
            "hideEmpty": not include_empty_subsets,
            # if the min degree is above 0, the no-set intersection is hidden
            "hideNoSet": min_degree is not None and min_degree > 0,
        },
        "visibleSets": totals.index.to_list(),
        "visibleAttributes": [],
        "bookmarks": [],
        "collapsed": [],
        "plots": {"scatterplots": [], "histograms": [], "wordClouds": []},
        "allSets": get_all_sets_info(totals),
        "bookmarkedIntersections": [],
    }

    if include_data:
        grammar["processedData"] = generate_processed_data(df, intersections, totals)
        grammar["rawData"] = {}
        grammar["accessibleProcessedData"] = generate_processed_data(
            df, intersections, totals, accessible=True
        )

    return grammar
def calculate_deviation(contained_sets, v_sets, sets, intersection_size, total_items):
    """
    Calculate the deviation of a given intersection.
    Based on deviation calculation in 2014 paper by Lex et al.

    The result is the observed share of items in the intersection minus the
    share expected if set memberships were independent, times 100.

    Parameters:
    contained_sets : list
        The list of sets contained in the subset row (intersection)
    v_sets : list
        The list of all visible sets loaded into the UpSet plot
    sets : dict
        A dictionary containing the set names as keys and their sizes as values.
    intersection_size : int
        The size of the subset row (intersection)
    total_items : int
        The total number of items in the dataset

    Returns:
    float: The deviation of the intersection, in percentage points.
        0.0 when ``total_items`` is zero (no shares can be computed).
    """
    # An empty dataset has no meaningful shares; avoid dividing by zero.
    if not total_items:
        return 0.0

    # Set lookup keeps the "not contained" test O(1) per visible set.
    members = set(contained_sets)

    contained_product = 1
    for name in contained_sets:
        contained_product *= sets[name] / total_items

    non_contained_product = 1
    for name in v_sets:
        if name not in members:
            non_contained_product *= 1 - sets[name] / total_items

    observed = intersection_size / total_items
    expected = contained_product * non_contained_product
    return (observed - expected) * 100
def get_element_name_from_id(id):
    """
    Returns the element name (for use in alt-txt) from the given ID.
    (e.g) "Just cat1" or "cat1, cat2, and cat3"

    Parameters:
    id : str
        The ID to retrieve the element name for, in the form
        "Subset~_~<name>~_~<name>...". (The parameter name shadows the
        builtin; it is kept for backward compatibility.)

    Returns:
    str: The element name. A single set gives "Just <name>", the empty
        subset gives "Unincluded", and several sets are joined with ", "
        with "and " before the last one.
    """
    prefix = "Subset~_~"
    # Strip only the leading prefix. str.replace would also delete the same
    # text inside a set name such as "MySubset", merging adjacent names.
    stripped_id = id[len(prefix):] if id.startswith(prefix) else id
    # "~_~" is the delimiter between set names
    elements = stripped_id.split("~_~")

    if len(elements) == 1:
        only = elements[0]
        # the empty subset is named "Unincluded"
        # and does not need "Just" prepended
        return only if only == "Unincluded" else f"Just {only}"

    *leading, last = elements
    return "".join(f"{name}, " for name in leading) + f"and {last}"
def generate_processed_data(df, intersections, totals, accessible=False):
    """
    Build the per-intersection data section of the grammar.

    Parameters:
    df : :class:`pandas.core.frame.DataFrame`
        The UpSet data DataFrame. Currently unused.
    intersections : :class:`pandas.core.series.Series`
        The list of intersections; one entry is produced per row.
    totals : :class:`pandas.core.series.Series`
        The list of totals, indexed by set name.
    accessible : bool, default=False
        If False, every entry additionally gets an empty "items" list.

    Returns:
    dict: {"values": {id: entry}, "order": [id, ...]} with the ids in the
        row order of ``intersections``.
    """
    processed = {"values": {}, "order": []}

    # Loop invariants, computed once instead of once per row.
    visible_sets = list(totals.index)
    # NOTE(review): the sum of set sizes counts an item once per set it
    # belongs to, so this is presumably larger than the real number of
    # items. Value kept as-is; confirm what the deviation should use.
    total_items = totals.sum()

    for row in range(len(intersections)):
        subset_id = generate_intersection_id(intersections, row)
        membership = get_set_membership_from_index(intersections, row)
        contained = [name for name, flag in membership.items() if flag == "Yes"]
        size = int(intersections.iat[row])

        entry = {
            "id": subset_id,
            "elementName": get_element_name_from_id(subset_id),
            "setMembership": membership,
            "size": size,
            "type": "Subset",
            "degree": get_degree_from_set_membership(membership),
            "attributes": {},
            "deviation": calculate_deviation(
                contained_sets=contained,
                v_sets=visible_sets,
                sets=totals,
                intersection_size=size,
                total_items=total_items,
            ),
        }
        if not accessible:
            entry["items"] = []

        processed["values"][subset_id] = entry
        processed["order"].append(subset_id)

    return processed
raise Exception(f"Failed to create alt text generator: {e}") + + try: + return gen.text + except Exception as e: + raise Exception(f"Failed to generate alt text: {e}") diff --git a/upsetplot/plotting.py b/upsetplot/plotting.py index c69636b..95e3391 100644 --- a/upsetplot/plotting.py +++ b/upsetplot/plotting.py @@ -8,6 +8,7 @@ from matplotlib import pyplot as plt from . import util +from .alt_text import fetch_alt_text, generate_grammar from .reformat import _get_subset_mask, query # prevents ImportError on matplotlib versions >3.5.2 @@ -278,6 +279,8 @@ class UpSet: include_empty_subsets : bool (default=False) If True, all possible category combinations will be shown as subsets, even when some are not present in data. + gen_grammar : bool (default=False) + If True, a grammar will be generated for the plot. """ _default_figsize = (10, 6) @@ -306,6 +309,8 @@ def __init__( show_counts="", show_percentages=False, include_empty_subsets=False, + gen_grammar=False, + meta_data=None, ): self._horizontal = orientation == "horizontal" self._reorient = _identity if self._horizontal else _transpose @@ -359,6 +364,45 @@ def __init__( {"facecolor": facecolor} for i in range(len(self.intersections)) ] self.subset_legend = [] # pairs of (style, label) + self._grammar = None + + # pre-generate the grammar + # it is necessary to do this within the init function, as some values (sort_by, sort_categories_by, etc) are not available outside of the plot initialization + if gen_grammar: + self._grammar = generate_grammar( + self._df, + self.intersections, + self.totals, + horizontal=self._horizontal, + sort_by=sort_by, + sort_categories_by=sort_categories_by, + min_degree=min_degree, + max_degree=max_degree, + include_empty_subsets=include_empty_subsets, + include_data=True, + meta_data=meta_data, + ) + + def get_alt_text(self): + """Return a textual description of the plot from upset-alttxt package + + Returns + ------- + dict + A json object with textual descriptions of the plot. 
def _update_grammar(self, key, value):
    """
    Update the grammar dictionary with the given key-value pair.

    If the existing value for the key is a list, the value is appended to it;
    otherwise the existing value is replaced.
    If no grammar was generated, nothing happens.

    Args:
        key (str): The key to update in the grammar dictionary.
        value (Any): The value to append to the existing list or assign to the key.

    Returns:
        True if the grammar was updated, None if there is no grammar.

    Raises:
        ValueError: If the key is not found in the grammar dictionary.
    """
    if self._grammar is None:
        return None
    if key not in self._grammar:
        raise ValueError(f"Key {key} not found in grammar")

    # The grammar lives in ``self._grammar``; there is no ``self.grammar``.
    current = self._grammar[key]
    if isinstance(current, list):
        current.append(value)
    else:
        self._grammar[key] = value
    return True
"techniqueDescription" in alt_text + assert "shortDescription" in alt_text + assert "longDescription" in alt_text + + +def test_fetch_alt_text_invalid_grammar(): + with pytest.raises(Exception, match="Failed to create alt text generator"): + fetch_alt_text({}) diff --git a/upsetplot/tests/test_examples.py b/upsetplot/tests/test_examples.py index 61bde51..115b10c 100644 --- a/upsetplot/tests/test_examples.py +++ b/upsetplot/tests/test_examples.py @@ -14,6 +14,7 @@ def test_example(path): pytest.importorskip("sklearn") pytest.importorskip("seaborn") + pytest.importorskip("upset-alttxt") env = os.environ.copy() env["PYTHONPATH"] = os.getcwd() + ":" + env.get("PYTHONPATH", "") subprocess.check_output([sys.executable, path], env=env)