From d18b58679a759ad569ccb29634453bcf2b8b78c2 Mon Sep 17 00:00:00 2001 From: Jake Wagoner Date: Wed, 10 Jan 2024 12:07:13 -0700 Subject: [PATCH 1/4] Add grammar generation (needs data) --- upsetplot/alt_text.py | 121 ++++++++++++++++++++++++++++++++++++++++++ upsetplot/plotting.py | 24 +++++++++ 2 files changed, 145 insertions(+) create mode 100644 upsetplot/alt_text.py diff --git a/upsetplot/alt_text.py b/upsetplot/alt_text.py new file mode 100644 index 0000000..01f6715 --- /dev/null +++ b/upsetplot/alt_text.py @@ -0,0 +1,121 @@ +def generate_grammar( + df, + intersections, + totals, + *, + sort_by, + sort_categories_by, + min_degree=None, + max_degree=None, + include_empty_subsets=False, +): + """ + Generate the grammar used by UpSet 2 and Multinet to generate alt text. + + Some values present in UpSet 2 will not be present in all implementations (e.g. Aggregation), so default values will be provided here + + .. versionadded:: 0.10 + + Parameters + ---------- + df : :class:`pandas.core.frame.DataFrame` + The UpSet data DataFrame. + intersections : :class:`pandas.core.series.Series` + The list of intersections. + totals : :class:`pandas.core.series.Series` + The list of totals. + sort_by : str + The attribute to sort the sets by. + sort_categories_by : str + The attribute to sort the categories by. + min_degree : number, optional + The minimum degree (optional). + max_degree : number, optional + The maximum degree (optional). + include_empty_subsets : bool, default=False + Whether to include empty subsets (default: False). + + Returns + ------- + The generated grammar compatible with UpSet 2 and Multinet's alt-text generator. + """ + + # default grammar state values required by UpSet 2/Multinet + grammar = { + "plotInformation": { + "description": "", + "sets": "", + "items": "" + }, + "firstAggregateBy": "None", + "firstOverlapDegree": 2, + "secondAggregateBy": "None", + "secondOverlapDegree": 2, + "sortVisibleBy": "Alphabetical", + "sortBy": "Size", + "filters": { + "maxVisible": 6, + "minVisible": 0, + "hideEmpty": True, + "hideNoSet": False + }, + "visibleSets": [], + "visibleAttributes": [], + "bookmarkedIntersections": [], + "collapsed": [], + "plots": { + "scatterplots": [], + "histograms": [], + "wordClouds": [] + }, + "allSets": [], + # this value will likely be redundant with the latest alt-text generator release + "altText": { + "verbosity": "low", + "explain": "full" + }, + "rawData": {}, # this value may not be necessary + "processedData": {}, + "accessibleProcessedData": {}, + } + + # TODO: update this when UpSet adds reverese sorting functionality + if sort_by == "cardinality" or sort_by == "-cardinality": + grammar["sortBy"] = "Size" + if sort_by == "degree" or sort_by == "-degree": + grammar["sortBy"] = "Degree" + # this sort type is not supported by UpSet 2 + if sort_by == "input" or sort_by == "-input": + grammar["sortBy"] = "Size" + + if sort_categories_by == "cardinality": + grammar["sortVisibleBy"] = "Descending" + if sort_categories_by == "-cardinality": + grammar["sortVisibleBy"] = "Ascending" + # this sort type is not supported by UpSet 2 + if sort_categories_by == "input" or sort_categories_by == "-input": + grammar["sortVisibleBy"] = "Alphabetical" + + grammar["filters"]["hideEmpty"] = not include_empty_subsets + grammar["filters"]["minVisible"] = ( + min_degree if min_degree is not None else 0 + ) + grammar["filters"]["maxVisible"] = ( + max_degree if max_degree is not None else 6 + ) + + # these two values are the same as there is no way to "hide" sets + grammar["visibleSets"] = totals.index.to_list() + grammar["allSets"] = totals.index.to_list() + + return grammar + + +def get_alt_text(): + """ + Get the alt text for an UpSet plot. + + Returns: + The alt text for the plot. + """ + pass diff --git a/upsetplot/plotting.py b/upsetplot/plotting.py index c69636b..48b4e54 100644 --- a/upsetplot/plotting.py +++ b/upsetplot/plotting.py @@ -10,6 +10,8 @@ from . import util from .reformat import _get_subset_mask, query +from .alt_text import generate_grammar + # prevents ImportError on matplotlib versions >3.5.2 try: from matplotlib.tight_layout import get_renderer @@ -360,6 +362,25 @@ def __init__( ] self.subset_legend = [] # pairs of (style, label) + self.grammar = generate_grammar( + self._df, + self.intersections, + self.totals, + sort_by=sort_by, + sort_categories_by=sort_categories_by, + # these attributes are not present in UpSet 2 + # subset_size=subset_size, + # sum_over=sum_over, + # min_subset_size=min_subset_size, + # max_subset_size=max_subset_size, + # max_subset_rank=max_subset_rank, + min_degree=min_degree, + max_degree=max_degree, + # this attribute is not present in UpSet 2 + # reverse=not self._horizontal, + include_empty_subsets=include_empty_subsets, + ) + def _swapaxes(self, x, y): if self._horizontal: return x, y @@ -633,6 +654,9 @@ def add_catplot(self, kind, value=None, elements=3, **kw): } ) + # add the category to the list of visible categories + self.grammar['visibleAttributes'].append(value) + def _check_value(self, value): if value is None and "_value" in self._df.columns: value = "_value" From d82c4fa5bbfd17c460d5ebc767c9cb37511b81fe Mon Sep 17 00:00:00 2001 From: Jake Wagoner Date: Thu, 22 Aug 2024 14:34:13 -0600 Subject: [PATCH 2/4] Add alt-text generation from upset-alttxt --- README.rst | 1 + doc/requirements.txt | 1 + examples/plot_alt_text.py | 73 +++++++++ upsetplot/alt_text.py | 302 ++++++++++++++++++++++++++++++++++---- upsetplot/plotting.py | 128 +++++++++++----- 5 files changed, 445 insertions(+), 60 deletions(-) create mode 100644 examples/plot_alt_text.py diff --git a/README.rst b/README.rst index e56155e..49633b3 100644 --- a/README.rst +++ b/README.rst @@ -130,6 +130,7 @@ Installation requires: * pandas * matplotlib >= 2.0 * seaborn to use `UpSet.add_catplot` +* upset-alttxt v0.4.3 to use `UpSet.get_alt_text` It should then be possible to:: diff --git a/doc/requirements.txt b/doc/requirements.txt index aa52f7d..b866391 100644 --- a/doc/requirements.txt +++ b/doc/requirements.txt @@ -10,3 +10,4 @@ scikit-learn nbsphinx sphinx<2 sphinx-rtd-theme +upset-alttxt==0.2.7 diff --git a/examples/plot_alt_text.py b/examples/plot_alt_text.py new file mode 100644 index 0000000..15590c9 --- /dev/null +++ b/examples/plot_alt_text.py @@ -0,0 +1,73 @@ +""" +========================================== +Data Vis: Alt text generation in UpSetPlot +========================================== + +Explore text description generation via upset-alttxt (2024). + +When text description generation is enabled, there are no changes to the actual plot. +The generated text description can be accessed after creating the UpSet plot object. + +""" + +from matplotlib import pyplot as plt +from upsetplot import generate_counts +from upsetplot import UpSet + +# Load the dataset into a DataFrame +example = generate_counts() + +########################################################################## + +print("Generating a plot AND grammar for textual description") +upset = UpSet( + example, + subset_size="count", + sort_by="-cardinality", + sort_categories_by="-cardinality", + orientation="vertical", + gen_grammar=True, + meta_data={"items": "RANDOM ITEMS"}, +) +upset.plot() +plt.suptitle("UpSet plot with text description generated'") +text_description = upset.get_alt_text() + +print('==================================') +print('Long Description (markdown formatted)') +print('==================================') +print(text_description['longDescription']) + +print('==================================') +print('Short Description') +print('==================================') +print(text_description['shortDescription']) + +print('\n==================================') +print('Technique Description') +print('==================================') +print(text_description['techniqueDescription']) + +plt.show() + + +print('\nNow to generate the same plot with no alt text generation') + +# To disable grammar generation, simply ignore the gen_grammar parameter or set it to False. +upset = UpSet( + example, + subset_size="count", + sort_by="-cardinality", + sort_categories_by="-cardinality", + orientation="vertical", +) + +upset.plot() +plt.suptitle("UpSet plot with no alt text generation") + +try: + text_description = upset.get_alt_text() +except ValueError: + print('gen_grammar must be set to True for any alt text generation.') + +plt.show() diff --git a/upsetplot/alt_text.py b/upsetplot/alt_text.py index 01f6715..d1657ec 100644 --- a/upsetplot/alt_text.py +++ b/upsetplot/alt_text.py @@ -1,13 +1,31 @@ +from alttxt.enums import Level +from alttxt.generator import AltTxtGen +from alttxt.tokenmap import TokenMap +from alttxt.parser import Parser + + +""" + alt_text.py + ----------- + This file contains various conversions from the datatypes, structures, + and values from UpSetPlot to Upset2 at upset.multinet.app. + This is necessary to generate alt text with the Multinet API. +""" + + def generate_grammar( df, intersections, totals, *, + horizontal, sort_by, sort_categories_by, min_degree=None, max_degree=None, include_empty_subsets=False, + include_data=False, + meta_data=None, ): """ Generate the grammar used by UpSet 2 and Multinet to generate alt text. @@ -24,6 +42,8 @@ def generate_grammar( The list of intersections. totals : :class:`pandas.core.series.Series` The list of totals. + horizontal: bool + Plot orientation. sort_by : str The attribute to sort the sets by. sort_categories_by : str @@ -34,19 +54,24 @@ def generate_grammar( The maximum degree (optional). include_empty_subsets : bool, default=False Whether to include empty subsets (default: False). + include_data : bool, default=False + Whether to include subset data (default: False). Returns ------- - The generated grammar compatible with UpSet 2 and Multinet's alt-text generator. + The generated grammar as a dictionary. """ # default grammar state values required by UpSet 2/Multinet grammar = { - "plotInformation": { + "plotInformation": { + "title": "", + "caption": "", "description": "", "sets": "", "items": "" }, + "horizontal": False, "firstAggregateBy": "None", "firstOverlapDegree": 2, "secondAggregateBy": "None", @@ -57,46 +82,56 @@ def generate_grammar( "maxVisible": 6, "minVisible": 0, "hideEmpty": True, - "hideNoSet": False + "hideNoSet": False, }, "visibleSets": [], "visibleAttributes": [], "bookmarkedIntersections": [], "collapsed": [], - "plots": { - "scatterplots": [], - "histograms": [], - "wordClouds": [] - }, + "plots": {"scatterplots": [], "histograms": [], "wordClouds": []}, "allSets": [], - # this value will likely be redundant with the latest alt-text generator release - "altText": { - "verbosity": "low", - "explain": "full" - }, - "rawData": {}, # this value may not be necessary - "processedData": {}, - "accessibleProcessedData": {}, } - # TODO: update this when UpSet adds reverese sorting functionality - if sort_by == "cardinality" or sort_by == "-cardinality": - grammar["sortBy"] = "Size" - if sort_by == "degree" or sort_by == "-degree": + grammar["horizontal"] = horizontal + + if meta_data is not None: + grammar["plotInformation"]["title"] = meta_data.get("title", "") + grammar["plotInformation"]["caption"] = meta_data.get("caption", "") + grammar["plotInformation"]["description"] = meta_data.get("description", "") + grammar["plotInformation"]["sets"] = meta_data.get("sets", "") + grammar["plotInformation"]["items"] = meta_data.get("items", "") + + if sort_by == "degree": grammar["sortBy"] = "Degree" + grammar["sortByOrder"] = "Descending" + if sort_by == "-degree": + grammar["sortBy"] = "Degree" + grammar["sortByOrder"] = "Ascending" + if sort_by == "cardinality": + grammar["sortBy"] = "Size" + grammar["sortByOrder"] = "Ascending" + if sort_by == "-cardinality": + grammar["sortBy"] = "Size" + grammar["sortByOrder"] = "Descending" # this sort type is not supported by UpSet 2 if sort_by == "input" or sort_by == "-input": grammar["sortBy"] = "Size" + grammar["sortByOrder"] = "Descending" if sort_categories_by == "cardinality": grammar["sortVisibleBy"] = "Descending" if sort_categories_by == "-cardinality": grammar["sortVisibleBy"] = "Ascending" - # this sort type is not supported by UpSet 2 + # this category sort type is not supported by UpSet 2 if sort_categories_by == "input" or sort_categories_by == "-input": grammar["sortVisibleBy"] = "Alphabetical" grammar["filters"]["hideEmpty"] = not include_empty_subsets + # if the min degree is above 0, the no set intersection should be hidden + grammar["filters"]["hideNoSet"] = ( + min_degree > 0 if min_degree is not None else False + ) + grammar["filters"]["minVisible"] = ( min_degree if min_degree is not None else 0 ) @@ -104,18 +139,233 @@ def generate_grammar( max_degree if max_degree is not None else 6 ) - # these two values are the same as there is no way to "hide" sets grammar["visibleSets"] = totals.index.to_list() - grammar["allSets"] = totals.index.to_list() + + grammar["allSets"] = get_all_sets_info(totals) + + grammar["bookmarkedIntersections"] = [ + # generate intersection ids, or simply append index? + ] + + if (include_data): + grammar["processedData"] = generate_processed_data( + df, intersections, totals + ) + grammar["rawData"] = {} + grammar["accessibleProcessedData"] = generate_processed_data( + df, intersections, totals, accessible=True + ) return grammar -def get_alt_text(): +def get_all_sets_info(totals): + """ + Returns a list of objects, each containing the name and size of a set. + + Parameters: + totals : dict + A dictionary containing the set names as keys and their sizes as values. + + Returns: + list: A list of dictionaries, where each dictionary represents a set and contains the keys "name" and "size". + The "name" key holds the name of the set (str), and the "size" key holds the size of the set (int). + """ + all_sets = [] + + for set_name, set_size in totals.items(): + all_sets.append({"name": set_name, "size": set_size}) + + return all_sets + + +def calculate_deviation( + contained_sets, + v_sets, + sets, + intersection_size, + total_items +): + """ + Calculate the deviation of a given intersection. + Based on deviation calculation in 2014 paper by Lex et al. + + Parameters: + contained_sets : list + The list of sets contained in the subset row (intersection) + v_sets : list + The list of all visible sets loaded into the UpSet plot + sets : dict + A dictionary containing the set names as keys and their sizes as values. + intersection_size : int + The size of the subset row (intersection) + total_items : int + The total number of items in the dataset + + Returns: + float: The deviation of the intersection. + """ + contained_product = 1 + for s in contained_sets: + set_size = sets[s] + contained_product *= set_size / total_items + + non_contained_product = 1 + for v in v_sets: + if v not in contained_sets: + set_size = sets[v] + non_contained_product *= 1 - set_size / total_items + + dev = intersection_size / total_items - contained_product * non_contained_product + + return dev * 100 + + +def get_set_membership_from_index(intersections, idx): + """ + Returns a dictionary indicating the set membership of a given index. + + Parameters: + intersections : :class:`pandas.core.series.Series` + The list of intersections. + idx : int + The index to retrieve set membership for. + + Returns: + dict: A dictionary where the keys are the set names and the values are either "Yes" or "No" indicating set membership. """ - Get the alt text for an UpSet plot. + names = intersections.index.names + set_membership = {} + for i, name in enumerate(names): + set_membership[name] = "Yes" if intersections.index[idx][i] else "No" + + return set_membership + + +def get_degree_from_set_membership(set_membership): + """ + Returns the degree of a given set membership. + + Parameters: + set_membership : dict + A dictionary indicating the set membership of a given index. + + Returns: + int: The degree of the set membership. + """ + return list(set_membership.values()).count("Yes") + + +def get_element_name_from_id(id): + """ + Returns the element name (for use in alt-txt) from the given ID. + (e.g) "Just cat1" or "cat1, cat2, and cat3" + + Parameters: + id : str + The ID to retrieve the element name for. + """ + # remove "Subset_" + # split the id by _ (this is the default delimiter between set names) + # join with commas, but the last element should also have "and " prepended + # if elements is only one element, return "Just {element}" + stripped_id = id.replace("Subset~_~", "") + elements = stripped_id.split("~_~") + + if len(elements) == 1: + # the empty subset is named "Unincluded" + # and does not need "Just" prepended + if (elements[0] == "Unincluded"): + return "Unincluded" + return f"Just {elements[0]}" + + element_name = "" + for i, element in enumerate(elements): + if i == len(elements) - 1: + element_name += f"and {element}" + else: + element_name += f"{element}, " + + return element_name + + +def generate_intersection_id(intersections, idx): + """ + Generates an intersection ID based on the given intersections and index. + + Parameters: + intersections : :class:`pandas.core.series.Series` + The list of intersections. + idx : int + The index to retrieve set membership for. + + Returns: + str: The generated intersection ID. + """ + names = intersections.index.names + intersection_id = "Subset" + set_membership = get_set_membership_from_index(intersections, idx) + for name in names: + # the delimiter "~_~" is used here as it is unlikely that it will be used in a set name + intersection_id += f"~_~{name}" if set_membership[name] == "Yes" else "" + + # the empty subset is named "Subset_Unincluded" in UpSet2 + if (intersection_id == "Subset"): + intersection_id += "~_~Unincluded" + + return intersection_id + + +def generate_processed_data(df, intersections, totals, accessible=False): + processedData = {"values": {}, "order": []} + # for every row in intersections: + # generate the setMembership object + for i in range(len(intersections)): + id = generate_intersection_id(intersections, i) + set_membership = get_set_membership_from_index(intersections, i) + contained_sets = [name for name, membership in set_membership.items() if membership == "Yes"] + + intersection_size = int(intersections.iat[i]) + + deviation = calculate_deviation( + contained_sets=contained_sets, + v_sets=list(totals.index), + sets=totals, + intersection_size=intersection_size, + total_items=totals.sum(), + ) + + processedData["values"][id] = { + "id": id, + "elementName": get_element_name_from_id(id), + "setMembership": set_membership, + "size": intersection_size, + "type": "Subset", + "degree": get_degree_from_set_membership(set_membership), + "attributes": {}, + "deviation": deviation, + } + if accessible: + processedData["values"][id]["deviation"] = deviation + else: + processedData["values"][id]["items"] = [] + processedData["order"].append(id) + + return processedData + + +def fetch_alt_text(grammar): + """ + Get the alt text for an UpSet plot. Calls Multinet API Returns: The alt text for the plot. """ - pass + parser = Parser(grammar) + parsed_data = parser.get_data() + parsed_grammar = parser.get_grammar() + + tokenmap: TokenMap = TokenMap(parsed_data, parsed_grammar, 'title') + gen = AltTxtGen(Level.DEFAULT, True, tokenmap, parsed_grammar) + + return gen.text diff --git a/upsetplot/plotting.py b/upsetplot/plotting.py index 48b4e54..965213a 100644 --- a/upsetplot/plotting.py +++ b/upsetplot/plotting.py @@ -10,7 +10,7 @@ from . import util from .reformat import _get_subset_mask, query -from .alt_text import generate_grammar +from .alt_text import fetch_alt_text, generate_grammar # prevents ImportError on matplotlib versions >3.5.2 try: @@ -280,6 +280,8 @@ class UpSet: include_empty_subsets : bool (default=False) If True, all possible category combinations will be shown as subsets, even when some are not present in data. + gen_grammar : bool (default=False) + If True, a grammar will be generated for the plot. """ _default_figsize = (10, 6) @@ -308,6 +310,8 @@ def __init__( show_counts="", show_percentages=False, include_empty_subsets=False, + gen_grammar=False, + meta_data=None, ): self._horizontal = orientation == "horizontal" self._reorient = _identity if self._horizontal else _transpose @@ -342,44 +346,64 @@ def __init__( self._show_counts = show_counts self._show_percentages = show_percentages - (self.total, self._df, self.intersections, self.totals) = _process_data( - data, - sort_by=sort_by, - sort_categories_by=sort_categories_by, - subset_size=subset_size, - sum_over=sum_over, - min_subset_size=min_subset_size, - max_subset_size=max_subset_size, - max_subset_rank=max_subset_rank, - min_degree=min_degree, - max_degree=max_degree, - reverse=not self._horizontal, - include_empty_subsets=include_empty_subsets, + (self.total, self._df, self.intersections, self.totals) = ( + _process_data( + data, + sort_by=sort_by, + sort_categories_by=sort_categories_by, + subset_size=subset_size, + sum_over=sum_over, + min_subset_size=min_subset_size, + max_subset_size=max_subset_size, + max_subset_rank=max_subset_rank, + min_degree=min_degree, + max_degree=max_degree, + reverse=not self._horizontal, + include_empty_subsets=include_empty_subsets, + ) ) self.category_styles = {} self.subset_styles = [ {"facecolor": facecolor} for i in range(len(self.intersections)) ] self.subset_legend = [] # pairs of (style, label) + self._grammar = None + + if (gen_grammar): + self._grammar = generate_grammar( + self._df, + self.intersections, + self.totals, + horizontal=self._horizontal, + sort_by=sort_by, + sort_categories_by=sort_categories_by, + min_degree=min_degree, + max_degree=max_degree, + include_empty_subsets=include_empty_subsets, + include_data=True, + meta_data=meta_data, + ) - self.grammar = generate_grammar( - self._df, - self.intersections, - self.totals, - sort_by=sort_by, - sort_categories_by=sort_categories_by, - # these attributes are not present in UpSet 2 - # subset_size=subset_size, - # sum_over=sum_over, - # min_subset_size=min_subset_size, - # max_subset_size=max_subset_size, - # max_subset_rank=max_subset_rank, - min_degree=min_degree, - max_degree=max_degree, - # this attribute is not present in UpSet 2 - # reverse=not self._horizontal, - include_empty_subsets=include_empty_subsets, - ) + def get_alt_text(self): + """Return a textual description of the plot from upset-alttxt + + Returns + ------- + dict + A json object with textual descriptions of the plot. + Contains entries: + 'techniqueDescription' (str), + 'shortDescription' (str) + 'longDescription' (str): a markdown formatted string + """ + if (self._grammar is None): + raise ValueError("Grammar not generated.") + + try: + return fetch_alt_text(self._grammar) + except Exception as e: + warnings.warn("Failed to fetch alt text: %s" % e) + return {} def _swapaxes(self, x, y): if self._horizontal: @@ -654,8 +678,11 @@ def add_catplot(self, kind, value=None, elements=3, **kw): } ) - # add the category to the list of visible categories - self.grammar['visibleAttributes'].append(value) + # add the category to the grammar (list of visible categories) + if self._grammar is not None: + self._grammar['visibleAttributes'].append(value) + + # attribute stats data needs to be added to every subset def _check_value(self, value): if value is None and "_value" in self._df.columns: @@ -960,6 +987,39 @@ def make_args(val): else: raise NotImplementedError("unhandled where: %r" % where) + def get_grammar(self): + """Return the grammar dictionary for the plot. + + Returns: + dict: The grammar dictionary for the plot. + """ + return self._grammar + + def _update_grammar(self, key, value): + """ + Update the grammar dictionary with the given key-value pair. + + If the key already exists in the grammar, the value is appended to the existing list. + If the key does not exist, a ValueError is raised. + + Args: + key (str): The key to update in the grammar dictionary. + value (Any): The value to append to the existing list or assign to the key. + + Raises: + ValueError: If the key is not found in the grammar dictionary. + """ + if self._grammar is None: + return + if key in self._grammar: + if isinstance(self.grammar[key], list): + self.grammar[key].append(value) + else: + self.grammar[key] = value + return True + else: + raise ValueError(f"Key {key} not found in grammar") + def plot_totals(self, ax): """Plot bars indicating total set size""" orig_ax = ax From fc60bceabf77d91b7aa7103bba681aab7d1b5dbd Mon Sep 17 00:00:00 2001 From: Jake Wagoner Date: Thu, 19 Dec 2024 12:30:45 -0700 Subject: [PATCH 3/4] Improve debug statements for alt-text gen and add version number to alt-text grammar --- README.rst | 2 +- examples/plot_alt_text.py | 1 - upsetplot/alt_text.py | 26 +++++++++++++++++--------- upsetplot/plotting.py | 2 +- 4 files changed, 19 insertions(+), 12 deletions(-) diff --git a/README.rst b/README.rst index 49633b3..492e435 100644 --- a/README.rst +++ b/README.rst @@ -130,7 +130,7 @@ Installation requires: * pandas * matplotlib >= 2.0 * seaborn to use `UpSet.add_catplot` -* upset-alttxt v0.4.3 to use `UpSet.get_alt_text` +* upset-alttxt v0.4.9 to use `UpSet.get_alt_text` It should then be possible to:: diff --git a/examples/plot_alt_text.py b/examples/plot_alt_text.py index 15590c9..b9d6140 100644 --- a/examples/plot_alt_text.py +++ b/examples/plot_alt_text.py @@ -27,7 +27,6 @@ sort_categories_by="-cardinality", orientation="vertical", gen_grammar=True, - meta_data={"items": "RANDOM ITEMS"}, ) upset.plot() plt.suptitle("UpSet plot with text description generated'") diff --git a/upsetplot/alt_text.py b/upsetplot/alt_text.py index d1657ec..9657555 100644 --- a/upsetplot/alt_text.py +++ b/upsetplot/alt_text.py @@ -64,6 +64,7 @@ def generate_grammar( # default grammar state values required by UpSet 2/Multinet grammar = { + "version": "0.1.0", # alt-text grammar version "plotInformation": { "title": "", "caption": "", @@ -86,7 +87,7 @@ def generate_grammar( }, "visibleSets": [], "visibleAttributes": [], - "bookmarkedIntersections": [], + "bookmarks": [], "collapsed": [], "plots": {"scatterplots": [], "histograms": [], "wordClouds": []}, "allSets": [], @@ -216,7 +217,7 @@ def calculate_deviation( set_size = sets[v] non_contained_product *= 1 - set_size / total_items - dev = intersection_size / total_items - contained_product * non_contained_product + dev = (intersection_size / total_items) - (contained_product * non_contained_product) return dev * 100 @@ -306,7 +307,7 @@ def generate_intersection_id(intersections, idx): intersection_id = "Subset" set_membership = get_set_membership_from_index(intersections, idx) for name in names: - # the delimiter "~_~" is used here as it is unlikely that it will be used in a set name + # the delimiter "~_~" is used in UpSet2 in the internal ID intersection_id += f"~_~{name}" if set_membership[name] == "Yes" else "" # the empty subset is named "Subset_Unincluded" in UpSet2 @@ -361,11 +362,18 @@ def fetch_alt_text(grammar): Returns: The alt text for the plot. """ - parser = Parser(grammar) - parsed_data = parser.get_data() - parsed_grammar = parser.get_grammar() + try: + parser = Parser(grammar) + parsed_data = parser.get_data() + parsed_grammar = parser.get_grammar() - tokenmap: TokenMap = TokenMap(parsed_data, parsed_grammar, 'title') - gen = AltTxtGen(Level.DEFAULT, True, tokenmap, parsed_grammar) + tokenmap: TokenMap = TokenMap(parsed_data, parsed_grammar, 'title') - return gen.text + gen = AltTxtGen(Level.DEFAULT, True, tokenmap, parsed_grammar) + except Exception as e: + raise Exception(f"Failed to create alt text generator: {e}") + + try: + return gen.text + except Exception as e: + raise Exception(f"Failed to generate alt text: {e}") diff --git a/upsetplot/plotting.py b/upsetplot/plotting.py index 965213a..b70e93c 100644 --- a/upsetplot/plotting.py +++ b/upsetplot/plotting.py @@ -385,7 +385,7 @@ def __init__( ) def get_alt_text(self): - """Return a textual description of the plot from upset-alttxt + """Return a textual description of the plot from upset-alttxt package Returns ------- From 7bcf41d2ffbcf741d56a47d42c7b2299b70e3d4d Mon Sep 17 00:00:00 2001 From: Jake Wagoner Date: Thu, 19 Dec 2024 13:48:03 -0700 Subject: [PATCH 4/4] Add testing coverage for alt-text --- .github/workflows/test.yml | 1 + README.rst | 2 +- doc/requirements.txt | 2 +- examples/plot_alt_text.py | 34 +++++++-------- upsetplot/__init__.py | 6 +++ upsetplot/alt_text.py | 58 ++++++++++--------------- upsetplot/plotting.py | 61 +++++++++++++------------- upsetplot/tests/test_alttext.py | 73 ++++++++++++++++++++++++++++++++ upsetplot/tests/test_examples.py | 1 + 9 files changed, 153 insertions(+), 85 deletions(-) create mode 100644 upsetplot/tests/test_alttext.py diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index f146d20..5e757c6 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -20,6 +20,7 @@ jobs: shell: bash -el {0} run: | conda install pytest pytest-cov coveralls ${{ matrix.conda-deps }} + pip install upset-alttxt==0.5.2 python setup.py install cp ci/matplotlibrc matplotlibrc - name: test diff --git a/README.rst b/README.rst index 492e435..2902314 100644 --- a/README.rst +++ b/README.rst @@ -130,7 +130,7 @@ Installation requires: * pandas * matplotlib >= 2.0 * seaborn to use `UpSet.add_catplot` -* upset-alttxt v0.4.9 to use `UpSet.get_alt_text` +* upset-alttxt v0.5.2 to use `UpSet.get_alt_text` It should then be possible to:: diff --git a/doc/requirements.txt b/doc/requirements.txt index b866391..ae049f7 100644 --- a/doc/requirements.txt +++ b/doc/requirements.txt @@ -10,4 +10,4 @@ scikit-learn nbsphinx sphinx<2 sphinx-rtd-theme -upset-alttxt==0.2.7 +upset-alttxt diff --git a/examples/plot_alt_text.py b/examples/plot_alt_text.py index b9d6140..e2dd6fa 100644 --- a/examples/plot_alt_text.py +++ b/examples/plot_alt_text.py @@ -3,7 +3,7 @@ Data Vis: Alt text generation in UpSetPlot ========================================== -Explore text description generation via upset-alttxt (2024). +Explore text description generation via upset-alttxt (2025). When text description generation is enabled, there are no changes to the actual plot. The generated text description can be accessed after creating the UpSet plot object. @@ -11,8 +11,8 @@ """ from matplotlib import pyplot as plt -from upsetplot import generate_counts -from upsetplot import UpSet + +from upsetplot import UpSet, generate_counts # Load the dataset into a DataFrame example = generate_counts() @@ -32,25 +32,25 @@ plt.suptitle("UpSet plot with text description generated'") text_description = upset.get_alt_text() -print('==================================') -print('Long Description (markdown formatted)') -print('==================================') -print(text_description['longDescription']) +print("==================================") +print("Long Description (markdown formatted)") +print("==================================") +print(text_description["longDescription"]) -print('==================================') -print('Short Description') -print('==================================') -print(text_description['shortDescription']) +print("==================================") +print("Short Description") +print("==================================") +print(text_description["shortDescription"]) -print('\n==================================') -print('Technique Description') -print('==================================') -print(text_description['techniqueDescription']) +print("\n==================================") +print("Technique Description") +print("==================================") +print(text_description["techniqueDescription"]) plt.show() -print('\nNow to generate the same plot with no alt text generation') +print("\nNow to generate the same plot with no alt text generation") # To disable grammar generation, simply ignore the gen_grammar parameter or set it to False. upset = UpSet( @@ -67,6 +67,6 @@ try: text_description = upset.get_alt_text() except ValueError: - print('gen_grammar must be set to True for any alt text generation.') + print("gen_grammar must be set to True for any alt text generation.") plt.show() diff --git a/upsetplot/__init__.py b/upsetplot/__init__.py index 4cb4364..07d9955 100644 --- a/upsetplot/__init__.py +++ b/upsetplot/__init__.py @@ -3,6 +3,10 @@ import os if os.environ.get("__IN-SETUP", None) != "1": + from .alt_text import ( + fetch_alt_text, + generate_grammar, + ) from .data import ( from_contents, from_indicators, @@ -24,4 +28,6 @@ "from_contents", "from_indicators", "query", + "generate_grammar", + "fetch_alt_text", ] diff --git a/upsetplot/alt_text.py b/upsetplot/alt_text.py index 9657555..ab2ca23 100644 --- a/upsetplot/alt_text.py +++ b/upsetplot/alt_text.py @@ -1,8 +1,7 @@ from alttxt.enums import Level from alttxt.generator import AltTxtGen -from alttxt.tokenmap import TokenMap from alttxt.parser import Parser - +from alttxt.tokenmap import TokenMap """ alt_text.py @@ -70,7 +69,7 @@ def generate_grammar( "caption": "", "description": "", "sets": "", - "items": "" + "items": "", }, "horizontal": False, "firstAggregateBy": "None", @@ -133,12 +132,8 @@ def generate_grammar( min_degree > 0 if min_degree is not None else False ) - grammar["filters"]["minVisible"] = ( - min_degree if min_degree is not None else 0 - ) - grammar["filters"]["maxVisible"] = ( - max_degree if max_degree is not None else 6 - ) + grammar["filters"]["minVisible"] = min_degree if min_degree is not None else 0 + grammar["filters"]["maxVisible"] = max_degree if max_degree is not None else 6 grammar["visibleSets"] = totals.index.to_list() @@ -148,10 +143,8 @@ def generate_grammar( # generate intersection ids, or simply append index? ] - if (include_data): - grammar["processedData"] = generate_processed_data( - df, intersections, totals - ) + if include_data: + grammar["processedData"] = generate_processed_data(df, intersections, totals) grammar["rawData"] = {} grammar["accessibleProcessedData"] = generate_processed_data( df, intersections, totals, accessible=True @@ -180,13 +173,7 @@ def get_all_sets_info(totals): return all_sets -def calculate_deviation( - contained_sets, - v_sets, - sets, - intersection_size, - total_items -): +def calculate_deviation(contained_sets, v_sets, sets, intersection_size, total_items): """ Calculate the deviation of a given intersection. Based on deviation calculation in 2014 paper by Lex et al. @@ -217,7 +204,9 @@ def calculate_deviation( set_size = sets[v] non_contained_product *= 1 - set_size / total_items - dev = (intersection_size / total_items) - (contained_product * non_contained_product) + dev = (intersection_size / total_items) - ( + contained_product * non_contained_product + ) return dev * 100 @@ -276,7 +265,7 @@ def get_element_name_from_id(id): if len(elements) == 1: # the empty subset is named "Unincluded" # and does not need "Just" prepended - if (elements[0] == "Unincluded"): + if elements[0] == "Unincluded": return "Unincluded" return f"Just {elements[0]}" @@ -311,7 +300,7 @@ def generate_intersection_id(intersections, idx): intersection_id += f"~_~{name}" if set_membership[name] == "Yes" else "" # the empty subset is named "Subset_Unincluded" in UpSet2 - if (intersection_id == "Subset"): + if intersection_id == "Subset": intersection_id += "~_~Unincluded" return intersection_id @@ -324,17 +313,19 @@ def generate_processed_data(df, intersections, totals, accessible=False): for i in range(len(intersections)): id = generate_intersection_id(intersections, i) set_membership = get_set_membership_from_index(intersections, i) - contained_sets = [name for name, membership in set_membership.items() if membership == "Yes"] + contained_sets = [ + name for name, membership in set_membership.items() if membership == "Yes" + ] intersection_size = int(intersections.iat[i]) deviation = calculate_deviation( - contained_sets=contained_sets, - v_sets=list(totals.index), - sets=totals, - intersection_size=intersection_size, - total_items=totals.sum(), - ) + contained_sets=contained_sets, + v_sets=list(totals.index), + sets=totals, + intersection_size=intersection_size, + total_items=totals.sum(), + ) processedData["values"][id] = { "id": id, @@ -357,17 +348,14 @@ def generate_processed_data(df, intersections, totals, accessible=False): def fetch_alt_text(grammar): """ - Get the alt text for an UpSet plot. Calls Multinet API - - Returns: - The alt text for the plot. + Get the alt text for an UpSet plot (from upset-alttxt) """ try: parser = Parser(grammar) parsed_data = parser.get_data() parsed_grammar = parser.get_grammar() - tokenmap: TokenMap = TokenMap(parsed_data, parsed_grammar, 'title') + tokenmap: TokenMap = TokenMap(parsed_data, parsed_grammar, "title") gen = AltTxtGen(Level.DEFAULT, True, tokenmap, parsed_grammar) except Exception as e: diff --git a/upsetplot/plotting.py b/upsetplot/plotting.py index b70e93c..95e3391 100644 --- a/upsetplot/plotting.py +++ b/upsetplot/plotting.py @@ -8,9 +8,8 @@ from matplotlib import pyplot as plt from . import util -from .reformat import _get_subset_mask, query - from .alt_text import fetch_alt_text, generate_grammar +from .reformat import _get_subset_mask, query # prevents ImportError on matplotlib versions >3.5.2 try: @@ -346,21 +345,19 @@ def __init__( self._show_counts = show_counts self._show_percentages = show_percentages - (self.total, self._df, self.intersections, self.totals) = ( - _process_data( - data, - sort_by=sort_by, - sort_categories_by=sort_categories_by, - subset_size=subset_size, - sum_over=sum_over, - min_subset_size=min_subset_size, - max_subset_size=max_subset_size, - max_subset_rank=max_subset_rank, - min_degree=min_degree, - max_degree=max_degree, - reverse=not self._horizontal, - include_empty_subsets=include_empty_subsets, - ) + (self.total, self._df, self.intersections, self.totals) = _process_data( + data, + sort_by=sort_by, + sort_categories_by=sort_categories_by, + subset_size=subset_size, + sum_over=sum_over, + min_subset_size=min_subset_size, + max_subset_size=max_subset_size, + max_subset_rank=max_subset_rank, + min_degree=min_degree, + max_degree=max_degree, + reverse=not self._horizontal, + include_empty_subsets=include_empty_subsets, ) self.category_styles = {} self.subset_styles = [ @@ -369,19 +366,21 @@ def __init__( self.subset_legend = [] # pairs of (style, label) self._grammar = None - if (gen_grammar): + # pre-generate the grammar + # it is necessary to do this within the init function, as some values (sort_by, sort_categories_by, etc) are not available outside of the plot initialization + if gen_grammar: self._grammar = generate_grammar( - self._df, - self.intersections, - self.totals, - horizontal=self._horizontal, - sort_by=sort_by, - sort_categories_by=sort_categories_by, - min_degree=min_degree, - max_degree=max_degree, - include_empty_subsets=include_empty_subsets, - include_data=True, - meta_data=meta_data, + self._df, + self.intersections, + self.totals, + horizontal=self._horizontal, + sort_by=sort_by, + sort_categories_by=sort_categories_by, + min_degree=min_degree, + max_degree=max_degree, + include_empty_subsets=include_empty_subsets, + include_data=True, + meta_data=meta_data, ) def get_alt_text(self): @@ -396,7 +395,7 @@ def get_alt_text(self): 'shortDescription' (str) 'longDescription' (str): a markdown formatted string """ - if (self._grammar is None): + if self._grammar is None: raise ValueError("Grammar not generated.") try: @@ -680,7 +679,7 @@ def add_catplot(self, kind, value=None, elements=3, **kw): # add the category to the grammar (list of visible categories) if self._grammar is not None: - self._grammar['visibleAttributes'].append(value) + self._grammar["visibleAttributes"].append(value) # attribute stats data needs to be added to every subset diff --git a/upsetplot/tests/test_alttext.py b/upsetplot/tests/test_alttext.py new file mode 100644 index 0000000..06163d2 --- /dev/null +++ b/upsetplot/tests/test_alttext.py @@ -0,0 +1,73 @@ +import pytest + +from upsetplot import UpSet, fetch_alt_text, generate_counts, generate_grammar + + +@pytest.fixture +def sample_data(): + return generate_counts() + + +@pytest.fixture +def test_generate_grammar(sample_data): + upset = UpSet( + sample_data, + subset_size="count", + sort_by="-cardinality", + sort_categories_by="-cardinality", + orientation="vertical", + gen_grammar=True, + ) + + grammar = upset.get_grammar() + assert isinstance(grammar, dict) + assert "version" in grammar + + return grammar + + +def test_generate_grammar_invalid_data(): + with pytest.raises(AttributeError): + generate_grammar( + df=None, + intersections=None, + totals=None, + horizontal=False, + sort_by="degree", + sort_categories_by="cardinality", + min_degree=None, + max_degree=None, + include_empty_subsets=False, + include_data=False, + meta_data=None, + ) + + +def test_generate_grammar_with_empty_subsets(sample_data): + grammar = generate_grammar( + df=sample_data, + intersections=sample_data, + totals=sample_data, + horizontal=False, + sort_by="degree", + sort_categories_by="cardinality", + min_degree=None, + max_degree=None, + include_empty_subsets=True, + include_data=False, + meta_data={"title": "Sample Plot", "caption": "This is a sample plot"}, + ) + assert grammar["filters"]["hideEmpty"] == False + + +def test_fetch_alt_text(test_generate_grammar): + alt_text = fetch_alt_text(test_generate_grammar) + assert isinstance(alt_text, dict) + assert "techniqueDescription" in alt_text + assert "shortDescription" in alt_text + assert "longDescription" in alt_text + + +def test_fetch_alt_text_invalid_grammar(): + with pytest.raises(Exception, match="Failed to create alt text generator"): + fetch_alt_text({}) diff --git a/upsetplot/tests/test_examples.py b/upsetplot/tests/test_examples.py index 61bde51..115b10c 100644 --- a/upsetplot/tests/test_examples.py +++ b/upsetplot/tests/test_examples.py @@ -14,6 +14,7 @@ def test_example(path): pytest.importorskip("sklearn") pytest.importorskip("seaborn") + pytest.importorskip("upset-alttxt") env = os.environ.copy() env["PYTHONPATH"] = os.getcwd() + ":" + env.get("PYTHONPATH", "") subprocess.check_output([sys.executable, path], env=env)