import ast
import datetime
import gzip
import json
from typing import Optional
@@ -238,76 +239,45 @@ def get_groups(filename: str, project_id: str) -> pd.DataFrame:
238239 return df
239240
240241
def calc_agreement(row: pd.Series) -> Optional[float]:
    """
    Compute the per-task "agreement", i.e. the extent to which raters
    agree for the i-th subject. This measure is a component of
    Fleiss' kappa: https://en.wikipedia.org/wiki/Fleiss%27_kappa

    Parameters
    ----------
    row: pd.Series
        Per-category vote counts plus a "total_count" entry holding the
        sum of all category counts.

    Returns
    -------
    Optional[float]
        The agreement value, or None when fewer than two users
        contributed (the measure is undefined for n <= 1).
    """

    n = row["total_count"]
    counts = row.drop(labels=["total_count"])

    # Agreement is undefined with fewer than two raters: the formula
    # below would divide by zero for n in {0, 1}.
    if n <= 1:
        return None

    # Extent to which raters agree for the i-th subject:
    # (sum of squared category counts - n) / (n * (n - 1))
    return (sum(c * c for c in counts) - n) / (n * (n - 1))
265261
266262
def calc_share(df: pd.DataFrame) -> pd.DataFrame:
    """Calculate the share of each category on the total count.

    Every "*_count" column is divided row-wise by "total_count" and the
    result is joined back onto the input frame as "*_share" columns.
    """
    counts = df.filter(like="count")
    shares = counts.div(df["total_count"], axis=0)
    # total_count / total_count is always 1 — not a meaningful share.
    shares = shares.drop(columns=["total_count"])
    shares.columns = shares.columns.str.replace("_count", "_share")
    return df.join(shares)
def calc_count(df: pd.DataFrame) -> pd.Series:
    """Return the total number of votes per task.

    Computed as the row-wise sum over all columns whose name contains
    "count". Returns a Series (one value per row), not a DataFrame.
    """
    return df.filter(like="count").sum(axis=1)
308276
309- def calc_quadkey (row ):
277+ def calc_quadkey (row : pd . DataFrame ):
310278 """Calculate quadkey based on task id."""
    # TODO: This does not make sense for media type, digitalization.
    # For these project types we should move to project classes.
311281 try :
312282 tile_z , tile_x , tile_y = row ["task_id" ].split ("-" )
313283 quadkey = tile_functions .tile_coords_and_zoom_to_quadKey (
@@ -320,8 +290,26 @@ def calc_quadkey(row):
320290 return quadkey
321291
322292
def add_missing_result_columns(
    df: pd.DataFrame,
    custom_options_values: pd.Series
) -> pd.DataFrame:
    """
    Ensure the grouped results frame has one column per possible answer.

    Answer options that were never chosen get a new column filled with 0;
    columns already present keep their values.
    """
    # custom_options_values is a single-element Series holding the
    # stringified sequence of all valid answer label values.
    option_values = ast.literal_eval(custom_options_values.item())
    return df.reindex(columns=list(option_values), fill_value=0)
308+
323309def get_agg_results_by_task_id (
324- results_df : pd .DataFrame , tasks_df : pd .DataFrame
310+ results_df : pd .DataFrame ,
311+ tasks_df : pd .DataFrame ,
312+ custom_options_values : pd .Series ,
325313) -> pd .DataFrame :
326314 """
327315 For each task several users contribute results.
@@ -339,6 +327,7 @@ def get_agg_results_by_task_id(
339327 ----------
340328 results_df: pd.DataFrame
341329 tasks_df: pd.DataFrame
330+ custom_options_values: pd.Series
342331 """
343332
344333 results_by_task_id_df = (
@@ -347,23 +336,27 @@ def get_agg_results_by_task_id(
347336 .unstack (fill_value = 0 )
348337 )
349338
350- # calculate total count and check if other counts are defined
351- results_by_task_id_df [["total_count" , 0 , 1 , 2 , 3 ]] = results_by_task_id_df .apply (
352- lambda row : calc_count (row ), axis = 1 , result_type = "expand"
339+ # add columns for answer options that were not chosen for any task
340+ results_by_task_id_df = add_missing_result_columns (
341+ results_by_task_id_df ,
342+ custom_options_values ,
353343 )
354344
345+ # TODO: Add logic for parent values using sub values
346+ # [<parent_value> = <parent_value> + <child_1_value> + .. <child_N_value>]
347+
348+ # needed for ogr2ogr todo: might be legacy?
349+ results_by_task_id_df = results_by_task_id_df .add_suffix ("_count" )
350+
351+ # calculate total count of votes per task
352+ results_by_task_id_df ["total_count" ] = calc_count (results_by_task_id_df )
353+
355354 # calculate share based on counts
356- results_by_task_id_df [
357- ["0_share" , "1_share" , "2_share" , "3_share" ]
358- ] = results_by_task_id_df .apply (
359- lambda row : calc_share (row ["total_count" ], row [0 ], row [1 ], row [2 ], row [3 ]),
360- axis = 1 ,
361- result_type = "expand" ,
362- )
355+ results_by_task_id_df = calc_share (results_by_task_id_df )
363356
364357 # calculate agreement
365358 results_by_task_id_df ["agreement" ] = results_by_task_id_df .apply (
366- lambda row : calc_agreement ( row [ "total_count" ], row [ 0 ], row [ 1 ], row [ 2 ], row [ 3 ]) ,
359+ calc_agreement ,
367360 axis = 1 ,
368361 )
369362 logger .info ("calculated agreement" )
@@ -383,11 +376,6 @@ def get_agg_results_by_task_id(
383376 )
384377 logger .info ("added geometry to aggregated results" )
385378
386- # rename columns, ogr2ogr will fail otherwise
387- agg_results_df .rename (
388- columns = {0 : "0_count" , 1 : "1_count" , 2 : "2_count" , 3 : "3_count" }, inplace = True
389- )
390-
391379 return agg_results_df
392380
393381
@@ -430,7 +418,11 @@ def get_per_project_statistics(project_id: str, project_info: pd.Series) -> dict
430418 add_metadata = False
431419
432420 # aggregate results by task id
433- agg_results_df = get_agg_results_by_task_id (results_df , tasks_df )
421+ agg_results_df = get_agg_results_by_task_id (
422+ results_df ,
423+ tasks_df ,
424+ project_info ["custom_options_values" ],
425+ )
434426 agg_results_df .to_csv (agg_results_filename , index_label = "idx" )
435427
436428 geojson_functions .gzipped_csv_to_gzipped_geojson (
0 commit comments