
Commit 4d082e1

Merge pull request #859 from mapswipe/feat/validation-backend
Feat/validation backend
2 parents 6597c97 + 957a9ab commit 4d082e1

File tree: 12 files changed, +435 -101 lines changed

api/nginx.conf

Lines changed: 4 additions & 0 deletions
@@ -2,6 +2,10 @@ server {
     listen 80;
     server_name api;
 
+    gzip on;
+    gzip_comp_level 2;
+    gzip_types text/plain text/csv text/css application/json text/javascript;
+
     location / {
         alias /usr/share/nginx/html/api/;
         autoindex on;
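
These directives make nginx compress any response whose MIME type matches the list, at a low CPU cost (gzip_comp_level 2). A minimal spot-check sketch, assuming the API server is reachable on localhost; the URL is illustrative, and whether a given file actually gets one of the listed MIME types depends on the container's mime.types:

import urllib.request

# Ask for a compressed response; nginx only honours this when the response
# MIME type matches gzip_types (and the body exceeds the default gzip_min_length).
req = urllib.request.Request(
    "http://localhost/website-data/overall-endpoints.csv",  # illustrative path
    headers={"Accept-Encoding": "gzip"},
)
with urllib.request.urlopen(req) as resp:
    print(resp.headers.get("Content-Encoding"))  # "gzip" if the type matched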

mapswipe_workers/mapswipe_workers/generate_stats/generate_stats.py

Lines changed: 61 additions & 0 deletions
@@ -1,11 +1,71 @@
+import csv
 import datetime as dt
+import hashlib
+import os
+import shutil
 from typing import List, Optional
 
 from mapswipe_workers import auth
 from mapswipe_workers.definitions import DATA_PATH, logger
 from mapswipe_workers.generate_stats import overall_stats, project_stats
 
 
+def generate_data_for_mapswipe_website():
+    """
+    Generate data for website
+    """
+    website_data_dest = f"{DATA_PATH}/api/website-data"
+
+    # TODO: Move to utils
+    def _compute_md5(file_name):
+        hash_md5 = hashlib.md5()
+        with open(file_name, "rb") as f:
+            for chunk in iter(lambda: f.read(4096), b""):
+                hash_md5.update(chunk)
+        return hash_md5.hexdigest()
+
+    def _project_history_zip():
+        project_history_file = f"{website_data_dest}/project-history"
+        zip_file_name = shutil.make_archive(
+            project_history_file,
+            "zip",
+            f"{DATA_PATH}/api/history/",
+        )
+        logger.info("finished generate project-history zip")
+        return zip_file_name
+
+    def _manifest_file():
+        endpoints_dir = f"{DATA_PATH}/api/"
+        manifest_file = f"{website_data_dest}/overall-endpoints.csv"
+        with open(manifest_file, "w") as fp:
+            csv_writer = csv.writer(fp)
+            csv_writer.writerow(["endpoints", "size_bytes"])
+            for path, _, files in os.walk(endpoints_dir):
+                for name in files:
+                    file_path = os.path.join(path, name)
+                    csv_writer.writerow(
+                        [
+                            "/api/" + file_path.split("/api/")[1],
+                            os.path.getsize(file_path),
+                        ]
+                    )
+        logger.info("finished generate endpoints manifest for existing stats")
+        return manifest_file
+
+    def _generate_file_hash(files):
+        for file in files:
+            md5_hash = _compute_md5(file)
+            with open(f"{file}.md5", "w") as fp:
+                fp.write(md5_hash)
+
+    files_to_track_for_checksum = [
+        f"{DATA_PATH}/api/projects/projects_centroid.geojson",
+        f"{DATA_PATH}/api/projects/projects_geom.geojson",
+    ]
+    files_to_track_for_checksum.extend([_project_history_zip(), _manifest_file()])
+    _generate_file_hash(files_to_track_for_checksum)
+
+
 def get_recent_projects(hours: int = 3):
     """Get ids for projects when results have been submitted within the last x hours."""
     pg_db = auth.postgresDB()

@@ -108,6 +168,7 @@ def generate_stats(project_id_list: Optional[List[str]] = None):
     overall_stats.get_overall_stats(projects_df, overall_stats_filename)
 
     logger.info(f"finished generate stats for: {project_id_list}")
+    generate_data_for_mapswipe_website()
 
 
 def generate_stats_all_projects():
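
The .md5 sidecar files written by _generate_file_hash let a consumer (e.g. the MapSwipe website) detect changes cheaply before re-downloading a large artifact. A minimal client-side sketch using the same chunked-MD5 scheme; the helper and file names are illustrative, not part of this commit:

import hashlib

def compute_md5(file_name: str) -> str:
    # Chunked read mirrors _compute_md5 above, so large GeoJSON files
    # never have to fit in memory at once.
    hash_md5 = hashlib.md5()
    with open(file_name, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()

def is_stale(data_file: str, md5_sidecar: str) -> bool:
    # True when the local copy no longer matches the published checksum.
    with open(md5_sidecar) as fp:
        published = fp.read().strip()
    return compute_md5(data_file) != published

# Hypothetical usage:
# is_stale("projects_geom.geojson", "projects_geom.geojson.md5")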

mapswipe_workers/mapswipe_workers/generate_stats/overall_stats.py

Lines changed: 10 additions & 0 deletions
@@ -64,6 +64,16 @@ def get_project_static_info(filename: str) -> pd.DataFrame:
             project_details
             ,regexp_replace(look_for, E'[\\n\\r]+', ' ', 'g' ) as look_for
             ,project_type
+            ,image
+            ,created
+            -- Custom options values
+            ,CASE
+                WHEN project_type_specifics->'customOptions' IS NOT NULL
+                THEN -- thus if we have answer labels use them
+                    (project_type_specifics->'customOptions')::TEXT
+                ELSE -- otherwise use below label range as the mapswipe app default
+                    '[{"value": 0}, {"value": 1}, {"value": 2}, {"value": 3}]'::TEXT
+            END as custom_options
             -- add an array of the tile server names
             ,CASE
                 WHEN project_type_specifics->'tileServer'->'name' IS NOT NULL THEN
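
The new custom_options column always yields a text value: either the project's own customOptions JSON or the fallback above, which encodes the classic four-answer scheme of the MapSwipe app. Downstream it is parsed with ast.literal_eval (see get_custom_options in project_stats.py below); a tiny sketch of what the fallback evaluates to:

import ast

default_custom_options = '[{"value": 0}, {"value": 1}, {"value": 2}, {"value": 3}]'
options = ast.literal_eval(default_custom_options)
# -> [{'value': 0}, {'value': 1}, {'value': 2}, {'value': 3}]
# Each entry may also carry a "subOptions" list; the default has none.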

mapswipe_workers/mapswipe_workers/generate_stats/project_stats.py

Lines changed: 96 additions & 72 deletions
@@ -1,9 +1,10 @@
+import ast
 import datetime
 import gzip
 import json
 import os
 import tempfile
-from typing import List
+import typing
 
 import pandas as pd
 from pandas.api.types import is_numeric_dtype

@@ -238,76 +239,57 @@ def get_groups(filename: str, project_id: str) -> pd.DataFrame:
     return df
 
 
-def calc_agreement(total: int, no: int, yes: int, maybe: int, bad: int) -> float:
+def calc_agreement(row: pd.Series) -> float:
     """
-    for each task the "agreement" is computed as defined by Scott's Pi
-    Scott's Pi is a measure for inter-rater reliability
-    https://en.wikipedia.org/wiki/Scott%27s_Pi
+    for each task the "agreement" is computed (i.e. the extent to which
+    raters agree for the i-th subject). This measure is a component of
+    Fleiss' kappa: https://en.wikipedia.org/wiki/Fleiss%27_kappa
     """
 
-    # TODO: currently this is implemented only for the 4 given categories
+    # Calculate total count as the sum of all categories
+    n = row["total_count"]
 
-    if total == 1:
-        agreement = 1.0
+    row = row.drop(labels=["total_count"])
+    # extent to which raters agree for the ith subject
+    # set agreement to None if only one user contributed
+    if n == 1 or n == 0:
+        agreement = None
     else:
-        agreement = (
-            1.0
-            / (total * (total - 1))
-            * (
-                (no * (no - 1))
-                + (yes * (yes - 1))
-                + (maybe * (maybe - 1))
-                + (bad * (bad - 1))
-            )
-        )
+        agreement = (sum([i**2 for i in row]) - n) / (n * (n - 1))
 
     return agreement
 
 
-def calc_share(total: int, no: int, yes: int, maybe: int, bad: int) -> List[float]:
+def calc_share(df: pd.DataFrame) -> pd.DataFrame:
     """Calculate the share of each category on the total count."""
-    no_share = no / total
-    yes_share = yes / total
-    maybe_share = maybe / total
-    bad_share = bad / total
-
-    return [no_share, yes_share, maybe_share, bad_share]
-
-
-def calc_count(row) -> List[int]:
-    """
-    Check if a count exists for each category ("no", "yes", "maybe", "bad").
-    Then calculate total count as the sum of all categories.
-    """
-
-    try:
-        no_count = row[0]
-    except KeyError:
-        no_count = 0
-
-    try:
-        yes_count = row[1]
-    except KeyError:
-        yes_count = 0
-
-    try:
-        maybe_count = row[2]
-    except KeyError:
-        maybe_count = 0
-
-    try:
-        bad_count = row[3]
-    except KeyError:
-        bad_count = 0
+    share_df = df.filter(like="count").div(df.total_count, axis=0)
+    share_df.drop("total_count", inplace=True, axis=1)
+    share_df.columns = share_df.columns.str.replace("_count", "_share")
+    return df.join(share_df)
 
-    total_count = no_count + yes_count + maybe_count + bad_count
-    assert total_count > 0, "Total count for result must be bigger than zero."
 
-    return [total_count, no_count, yes_count, maybe_count, bad_count]
+def calc_parent_option_count(
+    df: pd.DataFrame,
+    custom_options: typing.Dict[int, typing.Set[int]],
+) -> pd.DataFrame:
+    df_new = df.copy()
+    # Update option count using sub options count
+    for option, sub_options in custom_options.items():
+        for sub_option in sub_options:
+            df_new[f"{option}_count"] += df_new[f"{sub_option}_count"]
+    return df_new
 
 
-def calc_quadkey(row):
+def calc_count(df: pd.DataFrame) -> pd.DataFrame:
+    df_new = df.filter(like="count")
+    df_new_sum = df_new.sum(axis=1)
+    return df_new_sum
 
 
+def calc_quadkey(row: pd.DataFrame):
     """Calculate quadkey based on task id."""
+    # TODO: This does not make sense for media type, digtitalization.
+    # For these projects types we should move to project classes.
     try:
         tile_z, tile_x, tile_y = row["task_id"].split("-")
         quadkey = tile_functions.tile_coords_and_zoom_to_quadKey(
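
For the classic four categories the rewritten calc_agreement is algebraically identical to the old expression, since sum(n_i**2) - n equals sum(n_i * (n_i - 1)); what changed is that it now handles any number of answer columns, and tasks with zero or one contributor yield None instead of 1.0. A small worked check with hypothetical counts:

import pandas as pd

# one task, 4 users: 3 voted answer 0 ("no"), 1 voted answer 1 ("yes")
row = pd.Series({"0_count": 3, "1_count": 1, "2_count": 0, "3_count": 0, "total_count": 4})

n = row["total_count"]
counts = row.drop(labels=["total_count"])
agreement = (sum(i**2 for i in counts) - n) / (n * (n - 1))
print(agreement)  # 0.5, same as (3*2 + 1*0) / (4*3) under the old formula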
@@ -320,8 +302,42 @@ def calc_quadkey(row):
         return quadkey
 
 
+def get_custom_options(custom_options: pd.Series) -> typing.Dict[int, typing.Set[int]]:
+    eval_value = ast.literal_eval(custom_options.item())
+    return {
+        option["value"]: {
+            sub_option["value"] for sub_option in option.get("subOptions", [])
+        }
+        for option in eval_value
+    }
+
+
+def add_missing_result_columns(
+    df: typing.Union[pd.DataFrame, pd.Series],
+    custom_options: typing.Dict[int, typing.Set[int]],
+) -> pd.DataFrame:
+    """
+    Check if all possible answers columns are included in the grouped results
+    data frame and add columns if missing.
+    """
+
+    all_answer_label_values_set = set(
+        [
+            _option
+            for option, sub_options in custom_options.items()
+            for _option in [option, *sub_options]
+        ]
+    )
+    return df.reindex(
+        columns=sorted(all_answer_label_values_set),
+        fill_value=0,
+    )
+
+
 def get_agg_results_by_task_id(
-    results_df: pd.DataFrame, tasks_df: pd.DataFrame
+    results_df: pd.DataFrame,
+    tasks_df: pd.DataFrame,
+    custom_options_raw: pd.Series,
 ) -> pd.DataFrame:
     """
     For each task several users contribute results.

@@ -339,6 +355,7 @@ def get_agg_results_by_task_id(
     ----------
     results_df: pd.DataFrame
     tasks_df: pd.DataFrame
+    custom_options_raw: pd.Series
     """
 
     results_by_task_id_df = (

@@ -347,23 +364,31 @@ def get_agg_results_by_task_id(
         .unstack(fill_value=0)
     )
 
-    # calculate total count and check if other counts are defined
-    results_by_task_id_df[["total_count", 0, 1, 2, 3]] = results_by_task_id_df.apply(
-        lambda row: calc_count(row), axis=1, result_type="expand"
+    custom_options = get_custom_options(custom_options_raw)
+
+    # add columns for answer options that were not chosen for any task
+    results_by_task_id_df = add_missing_result_columns(
+        results_by_task_id_df,
+        custom_options,
     )
 
-    # calculate share based on counts
-    results_by_task_id_df[
-        ["0_share", "1_share", "2_share", "3_share"]
-    ] = results_by_task_id_df.apply(
-        lambda row: calc_share(row["total_count"], row[0], row[1], row[2], row[3]),
-        axis=1,
-        result_type="expand",
+    # needed for ogr2ogr todo: might be legacy?
+    results_by_task_id_df = results_by_task_id_df.add_suffix("_count")
+
+    # calculate total count of votes per task
+    results_by_task_id_df["total_count"] = calc_count(results_by_task_id_df)
+
+    results_by_task_id_df = calc_parent_option_count(
+        results_by_task_id_df,
+        custom_options,
    )
 
+    # calculate share based on counts
+    results_by_task_id_df = calc_share(results_by_task_id_df)
+
     # calculate agreement
     results_by_task_id_df["agreement"] = results_by_task_id_df.apply(
-        lambda row: calc_agreement(row["total_count"], row[0], row[1], row[2], row[3]),
+        calc_agreement,
         axis=1,
     )
     logger.info("calculated agreement")

@@ -383,11 +408,6 @@ def get_agg_results_by_task_id(
     )
     logger.info("added geometry to aggregated results")
 
-    # rename columns, ogr2ogr will fail otherwise
-    agg_results_df.rename(
-        columns={0: "0_count", 1: "1_count", 2: "2_count", 3: "3_count"}, inplace=True
-    )
-
     return agg_results_df
 
 
@@ -430,7 +450,11 @@ def get_per_project_statistics(project_id: str, project_info: pd.Series) -> dict
     add_metadata = False
 
     # aggregate results by task id
-    agg_results_df = get_agg_results_by_task_id(results_df, tasks_df)
+    agg_results_df = get_agg_results_by_task_id(
+        results_df,
+        tasks_df,
+        project_info["custom_options"],
+    )
     agg_results_df.to_csv(agg_results_filename, index_label="idx")
 
     geojson_functions.gzipped_csv_to_gzipped_geojson(
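
Taken together, the aggregation now derives its answer columns from the project's customOptions instead of hard-coding answers 0-3. A toy walk-through with illustrative values (one task, one parent option with a sub-option), mirroring the committed order of operations:

import ast
import pandas as pd

# counts per answer value after the groupby/unstack step (toy data)
df = pd.DataFrame({0: [2], 1: [1]}, index=["task-a"])

# customOptions text as delivered by the new SQL column
raw = pd.Series(['[{"value": 0, "subOptions": [{"value": 2}]}, {"value": 1}]'])
custom_options = {
    o["value"]: {s["value"] for s in o.get("subOptions", [])}
    for o in ast.literal_eval(raw.item())
}  # -> {0: {2}, 1: set()}

# add_missing_result_columns: answer 2 never appeared, so it is filled with 0
all_values = sorted({v for opt, subs in custom_options.items() for v in [opt, *subs]})
df = df.reindex(columns=all_values, fill_value=0).add_suffix("_count")

df["total_count"] = df.filter(like="count").sum(axis=1)  # calc_count

# calc_parent_option_count: sub-option votes roll up into their parent
df["0_count"] += df["2_count"]

print(df)  # -> 0_count=2, 1_count=1, 2_count=0, total_count=3

Note that calc_share joins the *_share columns onto the same frame before calc_agreement is applied, so as committed each row handed to calc_agreement still carries those share columns alongside the counts.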

mapswipe_workers/mapswipe_workers/generate_stats/tasking_manager_geometries.py

Lines changed: 13 additions & 8 deletions
@@ -19,10 +19,14 @@ def load_data(project_id: str, gzipped_csv_file: str) -> list:
     project_data = []
     with gzip.open(gzipped_csv_file, mode="rt") as f:
         reader = csv.reader(f, delimiter=",")
+        column_index_map = {}
 
         for i, row in enumerate(reader):
             if i == 0:
                 # skip header
+                column_index_map = {
+                    column_label: index for index, column_label in enumerate(row)
+                }
                 continue
 
             # the last row of the csv might contain a comment about data use

@@ -42,14 +46,15 @@
                     "task_x": task_x,
                     "task_y": task_y,
                     "task_z": task_z,
-                    "no_count": int(row[2]),
-                    "yes_count": int(row[3]),
-                    "maybe_count": int(row[4]),
-                    "bad_imagery_count": int(row[5]),
-                    "no_share": float(row[7]),
-                    "yes_share": float(row[8]),
-                    "maybe_share": float(row[9]),
-                    "bad_imagery_share": float(row[10]),
+                    # XXX: Assuming 0->No, 1->Yes, 2->Maybe, 3->Bad
+                    "no_count": int(column_index_map.get("0_count", 0)),
+                    "yes_count": int(column_index_map.get("1_count", 0)),
+                    "maybe_count": int(column_index_map.get("2_count", 0)),
+                    "bad_imagery_count": int(column_index_map.get("3_count", 0)),
+                    "no_share": float(column_index_map.get("0_count", 0)),
+                    "yes_share": float(column_index_map.get("1_count", 0)),
+                    "maybe_share": float(column_index_map.get("2_count", 0)),
+                    "bad_imagery_share": float(column_index_map.get("3_count", 0)),
                     "wkt": tile_functions.geometry_from_tile_coords(
                         task_x, task_y, task_z
                     ),
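
As committed, these dictionary entries store the column index returned by column_index_map.get(...) rather than the cell value at that index, and the four share fields look up *_count labels instead of *_share. A corrected sketch of the presumable intent (hypothetical helper, not part of this commit):

def read_row_values(row, column_index_map):
    def cell(label, cast, default):
        # resolve the header label to its position, then read the cell itself
        index = column_index_map.get(label)
        return cast(row[index]) if index is not None else default

    return {
        "no_count": cell("0_count", int, 0),
        "yes_count": cell("1_count", int, 0),
        "maybe_count": cell("2_count", int, 0),
        "bad_imagery_count": cell("3_count", int, 0),
        "no_share": cell("0_share", float, 0.0),
        "yes_share": cell("1_share", float, 0.0),
        "maybe_share": cell("2_share", float, 0.0),
        "bad_imagery_share": cell("3_share", float, 0.0),
    }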
