redefine currentness result value calculation (#274)

Gigaszi · web-flow · commit 7674eee1b6fe · 2022-09-11T19:20:52.000+02:00
* redefine currentness indicator
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,11 +1,17 @@
 # Changelog
 
-# Current Main
+
+## Current Main
 
 ## Breaking Changes
 
 - Rename layer `ideal_vgi_infrastructure` to `infrastructure_lines` ([#416])
 
+### New Features
+
+- Improve Currentness indicator ([#274])
+
+[#274]: https://github.com/GIScience/ohsome-quality-analyst/pull/274
 [#416]: https://github.com/GIScience/ohsome-quality-analyst/pull/416
 
 
diff --git a/workers/ohsome_quality_analyst/indicators/currentness/indicator.py b/workers/ohsome_quality_analyst/indicators/currentness/indicator.py
@@ -16,6 +16,17 @@ class Currentness(BaseIndicator):
     Ratio of all contributions that have been edited since 2008 until the
     current day in relation with years without mapping activities in the same
     time range
+
+    Attributes:
+        threshold_4 (int): Number of years it should have been since 50%
+            of the items were last edited to be in the best class
+        threshold_3 (int): Number of years it should have been since 50%
+            of the items were last edited to be in the second best class
+        threshold_2 (int): Number of years it should have been since 50%
+            of the items were last edited to be in the third best class
+        threshold_1 (int): Number of years it should have been since 50%
+            of the items were last edited to be in the second worst class. If
+            the result is higher than this threshold, it is assigned to the worst class
     """
 
     def __init__(
@@ -24,161 +35,205 @@ def __init__(
         feature: geojson.Feature,
     ) -> None:
         super().__init__(layer=layer, feature=feature)
-        self.threshold_yellow = 0.6
-        self.threshold_red = 0.2
+        self.threshold_4 = 1
+        self.threshold_3 = 2
+        self.threshold_2 = 4
+        self.threshold_1 = 8
         self.element_count = None
-        self.contribution_sum = 0
-        self.contributions_rel = {}
-        self.contributions_abs = {}
-        self.ratio = {}
+        self.contributions_sum = None
+        self.contributions_rel = {}  # yearly interval
+        self.contributions_abs = {}  # yearly interval
+        self.start = "2008-01-01"
+        self.end = None
+        self.low_contributions_threshold = 0
 
     async def preprocess(self) -> None:
+        """Get absolute number of contributions for each year since given start date"""
         latest_ohsome_stamp = await ohsome_client.get_latest_ohsome_timestamp()
-        # time_range for all years since 2008 and curr_year_range for the ongoing year
-        start = "2008-01-01"
         self.end = latest_ohsome_stamp.strftime("%Y-%m-%d")
-        time_range = "{0}/{1}/{2}".format(start, self.end, "P1Y")
-        curr_year_start = "{0}-01-01".format(latest_ohsome_stamp.year)
-        curr_year_range = "{0}/{1}".format(curr_year_start, self.end)
-
+        past_years_interval = "{0}/{1}/{2}".format(self.start, self.end, "P1Y")
+        current_year_interval = "{0}/{1}".format(
+            "{0}-01-01".format(latest_ohsome_stamp.year),
+            self.end,
+        )
+        # Fetch number of features
         response = await ohsome_client.query(self.layer, self.feature)
         self.element_count = response["result"][0]["value"]
         self.result.timestamp_osm = dateutil.parser.isoparse(
             response["result"][0]["timestamp"]
         )
-        response_contributions = await ohsome_client.query(
+        # Fetch all contributions of past years
+        contributions_yearly = await ohsome_client.query(
             self.layer,
             self.feature,
-            time=time_range,
+            time=past_years_interval,
             count_latest_contributions=True,
         )
-        for year in response_contributions["result"]:
-            time = dateutil.parser.isoparse(year["fromTimestamp"])
-            count = year["value"]
-            self.contributions_abs[time.strftime("%Y")] = count
-
-        curr_year_response_contributions = await ohsome_client.query(
+        # Fetch contributions of current year
+        contributions_current_year = await ohsome_client.query(
             self.layer,
             self.feature,
-            time=curr_year_range,
+            time=current_year_interval,
             count_latest_contributions=True,
         )
-        time = dateutil.parser.isoparse(
-            curr_year_response_contributions["result"][0]["fromTimestamp"]
+        # Merge contributions
+        contributions = (
+            contributions_yearly["result"] + contributions_current_year["result"]
         )
-        count = curr_year_response_contributions["result"][0]["value"]
-        self.contributions_abs[time.strftime("%Y")] = count
+        for contrib in contributions:
+            time = dateutil.parser.isoparse(contrib["fromTimestamp"])
+            count = contrib["value"]
+            self.contributions_abs[time.strftime("%Y")] = count
 
     def calculate(self) -> None:
+        """Calculate the years since over 50% of the elements were last edited"""
         logging.info(f"Calculation for indicator: {self.metadata.name}")
 
-        self.contribution_sum = sum(self.contributions_abs.values())
-        # It can be that features are counted, but have been deleted since.
+        # It can be that features have been edited, but have been deleted since.
         if self.element_count == 0:
             self.result.description = (
                 "In the area of interest no features "
                 "matching the filter are present today."
             )
             return
-        contributions_share = self.contribution_sum
-        last_edited_year = ""
-        # determine the percentage of elements that were last edited in that year
-        for year in self.contributions_abs:
-            self.ratio[year] = (contributions_share / self.contribution_sum) * 100
-            contributions_share -= self.contributions_abs[year]
-            self.contributions_rel[year] = (
-                self.contributions_abs[year] / self.contribution_sum
-            ) * 100
-            if self.contributions_rel[year] != 0:
-                last_edited_year = year
-        years_since_last_edit = int(self.result.timestamp_oqt.year) - int(
-            last_edited_year
-        )
-        percentage_contributions = 0
-        median_year = ""
-        for year in self.contributions_rel:
-            percentage_contributions += self.contributions_rel[year]
-            if percentage_contributions < 50:
-                continue
+
+        # calculate relative number of contributions for each year
+        self.contributions_sum = sum(self.contributions_abs.values())
+        contributions_rel = {}
+        contrib_rel_cum_green = 0
+        contrib_rel_cum_yellow = 0
+        contrib_rel_cum_red = 0
+        for num_of_years, (year, contrib_abs) in enumerate(
+            reversed(self.contributions_abs.items()),
+            start=1,
+        ):
+            contrib_rel = contrib_abs / self.contributions_sum
+            contributions_rel[year] = contrib_rel
+            if num_of_years < self.threshold_2:
+                contrib_rel_cum_green += contrib_rel
+            elif num_of_years < self.threshold_1:
+                contrib_rel_cum_yellow += contrib_rel
             else:
-                median_year = year
-                break
-        median_diff = int(self.result.timestamp_oqt.year) - int(median_year)
-        if median_diff <= 1:
-            param_1 = 1
-        elif median_diff <= 4:
-            param_1 = 0.6
-        else:
-            param_1 = 0.2
-        if years_since_last_edit <= 1:
-            param_2 = 1
-        elif years_since_last_edit <= 4:
-            param_2 = 0.6
-        else:
-            param_2 = 0.2
-        self.result.value = (param_1 + param_2) / 2
-        if median_diff == 0:
-            median_diff = "this year"
-        elif median_diff == 1:
-            median_diff = "in the last year"
-        else:
-            median_diff = "in the last {0} years".format(median_diff)
+                contrib_rel_cum_red += contrib_rel
+        self.contributions_rel = dict(sorted(contributions_rel.items()))
+        # calculate the year in which 50% of the total edits have been made
+        self.median_year = get_median_year(self.contributions_rel)
+        # years between the median year and the year of timestamp_oqt
+        self.result.value = int(self.result.timestamp_oqt.year) - self.median_year
+
         self.result.description = Template(self.metadata.result_description).substitute(
-            years=median_diff,
+            years=self.result.value,
             layer_name=self.layer.name,
             end=self.end,
-            elements=self.contribution_sum,
+            elements=int(self.contributions_sum),
+            green=round(contrib_rel_cum_green * 100, 2),
+            yellow=round(contrib_rel_cum_yellow * 100, 2),
+            red=round(contrib_rel_cum_red * 100, 2),
+            median_years=self.result.value,
+            threshold_green=self.threshold_2 - 1,
+            threshold_yellow_start=self.threshold_2,
+            threshold_yellow_end=self.threshold_1 - 1,
+            threshold_red=self.threshold_1,
         )
-
-        if self.result.value >= self.threshold_yellow:
-            self.result.class_ = 5
+        if self.result.value > self.threshold_1:
+            self.result.class_ = 1
             self.result.description = (
-                self.result.description + self.metadata.label_description["green"]
+                self.result.description + self.metadata.label_description["red"]
             )
-        elif self.result.value >= self.threshold_red:
-            self.result.class_ = 3
+        elif self.threshold_1 >= self.result.value > self.threshold_3:
+            self.result.class_ = 2
             self.result.description = (
                 self.result.description + self.metadata.label_description["yellow"]
             )
-        elif self.result.value < self.threshold_red:
-            self.result.class_ = 1
+        elif self.threshold_3 >= self.result.value:
+            self.result.class_ = 5
             self.result.description = (
-                self.result.description + self.metadata.label_description["red"]
+                self.result.description + self.metadata.label_description["green"]
             )
         else:
             raise ValueError("Ratio has an unexpected value.")
+        last_edited_year = get_last_edited_year(self.contributions_abs)
+        self.years_since_last_edit = (
+            int(self.result.timestamp_oqt.year) - last_edited_year
+        )
+        if last_edited_year != self.result.timestamp_oqt.year:
+            self.result.description += (
+                "Attention: There was no mapping activity after "
+                + "{0} in this region.".format(last_edited_year)
+            )
+        if self.contributions_sum < self.low_contributions_threshold:
+            self.result.description += (
+                "Attention: In this region there are very few contributions "
+                + "({0}) with the given tags ".format(self.contributions_sum)
+            )
 
     def create_figure(self) -> None:
         """Create a plot.
 
         Shows the percentage of contributions for each year.
         """
-        if self.element_count == 0:
+        if self.result.label == "undefined":
+            logging.info("Result is undefined. Skipping figure creation.")
             return
         px = 1 / plt.rcParams["figure.dpi"]  # Pixel in inches
         figsize = (400 * px, 400 * px)
-        fig = plt.figure(figsize=figsize)
+        fig = plt.figure(figsize=figsize, tight_layout=True)
         ax = fig.add_subplot()
-        x = list(self.ratio.keys())
-        ax.plot(
-            x,
-            self.ratio.values(),
-            color="b",
-            label="Percentage of contributions (cumulative)",
+        patches = ax.bar(
+            self.contributions_rel.keys(),
+            height=[v * 100 for v in self.contributions_rel.values()],
+            edgecolor="black",
         )
-        ax.bar(
-            list(self.contributions_rel.keys()),
-            self.contributions_rel.values(),
-            color=self.result.label,
-            label="Percentage of contributions (year) ",
+        year_range = len(self.contributions_rel)
+        last_edited_year = get_last_edited_year(self.contributions_abs)
+        years_since_last_edit = int(self.result.timestamp_oqt.year) - last_edited_year
+        for patch in patches:
+            if year_range <= years_since_last_edit:
+                ax.text(
+                    patch.get_x(),
+                    max(self.contributions_rel.values()) * 100 / 2,
+                    "!",
+                    fontdict={"fontsize": 26},
+                )
+            if year_range >= self.threshold_1:
+                patch.set_facecolor("red")
+                year_range -= 1
+            elif year_range >= self.threshold_2:
+                patch.set_facecolor("yellow")
+                year_range -= 1
+            else:
+                patch.set_facecolor("green")
+                year_range -= 1
+        plt.axvline(
+            x=str(self.median_year),
+            linestyle=":",
+            color="black",
+            label="Median Year: {0}".format(self.median_year),
         )
-        ax.set_xticks(x[::2])
+        plt.xticks(list(self.contributions_rel.keys())[::2])
+        plt.xlabel("Year")
         plt.ylabel("Percentage of contributions")
-        plt.title("Total Contributions: %i" % self.contribution_sum)
+        plt.title("Total Contributions: %i" % self.contributions_sum)
         ax.legend(loc="lower center", bbox_to_anchor=(0.5, -0.45))
         fig.subplots_adjust(bottom=0.3)
         fig.tight_layout()
         img_data = StringIO()
-        plt.savefig(img_data, format="svg")
-        self.result.svg = img_data.getvalue()  # this is svg data
+        plt.savefig(img_data, format="svg", bbox_inches="tight")
+        self.result.svg = img_data.getvalue()
         plt.close("all")
+
+
+def get_last_edited_year(contributions: dict) -> int:
+    """Get the year in which the last edit has been made"""
+    for year, contrib in dict(reversed(sorted(contributions.items()))).items():
+        if contrib != 0:
+            return int(year)
+
+
+def get_median_year(contributions: dict) -> int:
+    """Get the first year by which the cumulative share of edits reaches 50%"""
+    contrib_rel_cum = 0
+    for year, contrib in dict(sorted(contributions.items())).items():
+        contrib_rel_cum += contrib
+        if contrib_rel_cum >= 0.5:
+            return int(year)
diff --git a/workers/ohsome_quality_analyst/indicators/currentness/metadata.yaml b/workers/ohsome_quality_analyst/indicators/currentness/metadata.yaml
@@ -3,26 +3,23 @@ Currentness:
   name: Currentness
   description: |
     Ratio of all contributions that have been edited since 2008 until the current day in relation with years without mapping activities in the same
-    time range.
+    time range. 
     Refers to data quality in respect to currentness.
   label_description:
     red: |
-      More than half of the $elements edited elements were last edited over 4 years ago.
-      Be aware that there it is very likely that many map features are outdated. 
-      You should carefully check this before using the data as it indicates bad 
-      data quality in respect to currentness.
+      It is likely that many features are outdated.
     yellow: |
-      More than half of the edited elements were last edited between 1 and 4 years ago.
-      This refers to medium data quality in respect to currentness.
+      It is likely that some features are up-to-date and some features are outdated.
     green: |
-      This is a rather high value and indicates that the map features 
-      are very unlike to be outdated. This refers to good data quality in 
-      respect to currentness.
+      It is likely that most features are up-to-date.
     undefined: |
       The quality level could not be calculated for this indicator. 
       This is most likely due to the fact that no features have been mapped 
       for this area of interest. Refer to other indicators that rely on 
       an extrinsic comparison to identify if this means that data quality is 
       bad or if there is just nothing to map here.
   result_description: |
-      Over 50% of the $elements features ($layer_name) were edited $years.
+      In the last $threshold_green years $green % of the elements were edited the last time.
+      In the period from $threshold_yellow_start to $threshold_yellow_end years ago $yellow % of the elements were edited the last time.
+      The remaining $red % were last edited more than $threshold_red years ago.
+      The median currentness of the $elements features ($layer_name) is $median_years year(s).
diff --git a/workers/tests/integrationtests/api_response_schema.py b/workers/tests/integrationtests/api_response_schema.py
@@ -22,7 +22,7 @@ def get_indicator_properties_template():
         "result": {
             "timestamp_oqt": str,
             "timestamp_osm": Or(str),
-            "value": Or(float, None),
+            "value": Or(float, str, int, None),
             "label": str,
             "description": str,
             Opt("svg"): str,
diff --git a/workers/tests/integrationtests/fixtures/vcr_cassettes/test_indicator_currentness.yaml b/workers/tests/integrationtests/fixtures/vcr_cassettes/test_indicator_currentness.yaml
diff --git a/workers/tests/integrationtests/test_indicator_currentness.py b/workers/tests/integrationtests/test_indicator_currentness.py