
Commit 4d082e1

Merge pull request #859 from mapswipe/feat/validation-backend
Feat/validation backend
2 parents 6597c97 + 957a9ab commit 4d082e1

File tree: 12 files changed, +435 -101 lines changed

api/nginx.conf

Lines changed: 4 additions & 0 deletions
@@ -2,6 +2,10 @@ server {
     listen 80;
     server_name api;
 
+    gzip on;
+    gzip_comp_level 2;
+    gzip_types text/plain text/csv text/css application/json text/javascript;
+
     location / {
         alias /usr/share/nginx/html/api/;
         autoindex on;
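
These directives make nginx compress any response whose MIME type matches the list, at a low CPU cost (gzip_comp_level 2). A minimal spot-check sketch, assuming the API server is reachable on localhost; the URL is illustrative, and whether a given file actually gets one of the listed MIME types depends on the container's mime.types:

import urllib.request

# Ask for a compressed response; nginx only honours this when the response
# MIME type matches gzip_types (and the body exceeds the default gzip_min_length).
req = urllib.request.Request(
    "http://localhost/website-data/overall-endpoints.csv",  # illustrative path
    headers={"Accept-Encoding": "gzip"},
)
with urllib.request.urlopen(req) as resp:
    print(resp.headers.get("Content-Encoding"))  # "gzip" if the type matched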

mapswipe_workers/mapswipe_workers/generate_stats/generate_stats.py

Lines changed: 61 additions & 0 deletions
@@ -1,11 +1,71 @@
+import csv
 import datetime as dt
+import hashlib
+import os
+import shutil
 from typing import List, Optional
 
 from mapswipe_workers import auth
 from mapswipe_workers.definitions import DATA_PATH, logger
 from mapswipe_workers.generate_stats import overall_stats, project_stats
 
 
+def generate_data_for_mapswipe_website():
+    """
+    Generate data for website
+    """
+    website_data_dest = f"{DATA_PATH}/api/website-data"
+
+    # TODO: Move to utils
+    def _compute_md5(file_name):
+        hash_md5 = hashlib.md5()
+        with open(file_name, "rb") as f:
+            for chunk in iter(lambda: f.read(4096), b""):
+                hash_md5.update(chunk)
+        return hash_md5.hexdigest()
+
+    def _project_history_zip():
+        project_history_file = f"{website_data_dest}/project-history"
+        zip_file_name = shutil.make_archive(
+            project_history_file,
+            "zip",
+            f"{DATA_PATH}/api/history/",
+        )
+        logger.info("finished generate project-history zip")
+        return zip_file_name
+
+    def _manifest_file():
+        endpoints_dir = f"{DATA_PATH}/api/"
+        manifest_file = f"{website_data_dest}/overall-endpoints.csv"
+        with open(manifest_file, "w") as fp:
+            csv_writer = csv.writer(fp)
+            csv_writer.writerow(["endpoints", "size_bytes"])
+            for path, _, files in os.walk(endpoints_dir):
+                for name in files:
+                    file_path = os.path.join(path, name)
+                    csv_writer.writerow(
+                        [
+                            "/api/" + file_path.split("/api/")[1],
+                            os.path.getsize(file_path),
+                        ]
+                    )
+        logger.info("finished generate endpoints manifest for existing stats")
+        return manifest_file
+
+    def _generate_file_hash(files):
+        for file in files:
+            md5_hash = _compute_md5(file)
+            with open(f"{file}.md5", "w") as fp:
+                fp.write(md5_hash)
+
+    files_to_track_for_checksum = [
+        f"{DATA_PATH}/api/projects/projects_centroid.geojson",
+        f"{DATA_PATH}/api/projects/projects_geom.geojson",
+    ]
+    files_to_track_for_checksum.extend([_project_history_zip(), _manifest_file()])
+    _generate_file_hash(files_to_track_for_checksum)
+
+
 def get_recent_projects(hours: int = 3):
     """Get ids for projects when results have been submitted within the last x hours."""
     pg_db = auth.postgresDB()

@@ -108,6 +168,7 @@ def generate_stats(project_id_list: Optional[List[str]] = None):
     overall_stats.get_overall_stats(projects_df, overall_stats_filename)
 
     logger.info(f"finished generate stats for: {project_id_list}")
+    generate_data_for_mapswipe_website()
 
 
 def generate_stats_all_projects():
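
The .md5 sidecar files written by _generate_file_hash let a consumer (e.g. the MapSwipe website) detect changes cheaply before re-downloading a large artifact. A minimal client-side sketch using the same chunked-MD5 scheme; the helper and file names are illustrative, not part of this commit:

import hashlib

def compute_md5(file_name: str) -> str:
    # Chunked read mirrors _compute_md5 above, so large GeoJSON files
    # never have to fit in memory at once.
    hash_md5 = hashlib.md5()
    with open(file_name, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()

def is_stale(data_file: str, md5_sidecar: str) -> bool:
    # True when the local copy no longer matches the published checksum.
    with open(md5_sidecar) as fp:
        published = fp.read().strip()
    return compute_md5(data_file) != published

# Hypothetical usage:
# is_stale("projects_geom.geojson", "projects_geom.geojson.md5")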

mapswipe_workers/mapswipe_workers/generate_stats/overall_stats.py

Lines changed: 10 additions & 0 deletions
@@ -64,6 +64,16 @@ def get_project_static_info(filename: str) -> pd.DataFrame:
             project_details
             ,regexp_replace(look_for, E'[\\n\\r]+', ' ', 'g' ) as look_for
             ,project_type
+            ,image
+            ,created
+            -- Custom options values
+            ,CASE
+                WHEN project_type_specifics->'customOptions' IS NOT NULL
+                THEN -- thus if we have answer labels use them
+                    (project_type_specifics->'customOptions')::TEXT
+                ELSE -- otherwise use below label range as the mapswipe app default
+                    '[{"value": 0}, {"value": 1}, {"value": 2}, {"value": 3}]'::TEXT
+            END as custom_options
             -- add an array of the tile server names
             ,CASE
                 WHEN project_type_specifics->'tileServer'->'name' IS NOT NULL THEN
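
The new custom_options column always yields a text value: either the project's own customOptions JSON or the fallback above, which encodes the classic four-answer scheme of the MapSwipe app. Downstream it is parsed with ast.literal_eval (see get_custom_options in project_stats.py below); a tiny sketch of what the fallback evaluates to:

import ast

default_custom_options = '[{"value": 0}, {"value": 1}, {"value": 2}, {"value": 3}]'
options = ast.literal_eval(default_custom_options)
# -> [{'value': 0}, {'value': 1}, {'value': 2}, {'value': 3}]
# Each entry may also carry a "subOptions" list; the default has none.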

mapswipe_workers/mapswipe_workers/generate_stats/project_stats.py

Lines changed: 96 additions & 72 deletions
@@ -1,9 +1,10 @@
+import ast
 import datetime
 import gzip
 import json
 import os
 import tempfile
-from typing import List
+import typing
 
 import pandas as pd
 from pandas.api.types import is_numeric_dtype

@@ -238,76 +239,57 @@ def get_groups(filename: str, project_id: str) -> pd.DataFrame:
     return df
 
 
-def calc_agreement(total: int, no: int, yes: int, maybe: int, bad: int) -> float:
+def calc_agreement(row: pd.Series) -> float:
     """
-    for each task the "agreement" is computed as defined by Scott's Pi
-    Scott's Pi is a measure for inter-rater reliability
-    https://en.wikipedia.org/wiki/Scott%27s_Pi
+    for each task the "agreement" is computed (i.e. the extent to which
+    raters agree for the i-th subject). This measure is a component of
+    Fleiss' kappa: https://en.wikipedia.org/wiki/Fleiss%27_kappa
     """
 
-    # TODO: currently this is implemented only for the 4 given categories
+    # Calculate total count as the sum of all categories
+    n = row["total_count"]
 
-    if total == 1:
-        agreement = 1.0
+    row = row.drop(labels=["total_count"])
+    # extent to which raters agree for the ith subject
+    # set agreement to None if only one user contributed
+    if n == 1 or n == 0:
+        agreement = None
     else:
-        agreement = (
-            1.0
-            / (total * (total - 1))
-            * (
-                (no * (no - 1))
-                + (yes * (yes - 1))
-                + (maybe * (maybe - 1))
-                + (bad * (bad - 1))
-            )
-        )
+        agreement = (sum([i**2 for i in row]) - n) / (n * (n - 1))
 
     return agreement
 
 
-def calc_share(total: int, no: int, yes: int, maybe: int, bad: int) -> List[float]:
+def calc_share(df: pd.DataFrame) -> pd.DataFrame:
     """Calculate the share of each category on the total count."""
-    no_share = no / total
-    yes_share = yes / total
-    maybe_share = maybe / total
-    bad_share = bad / total
-
-    return [no_share, yes_share, maybe_share, bad_share]
-
-
-def calc_count(row) -> List[int]:
-    """
-    Check if a count exists for each category ("no", "yes", "maybe", "bad").
-    Then calculate total count as the sum of all categories.
-    """
-
-    try:
-        no_count = row[0]
-    except KeyError:
-        no_count = 0
-
-    try:
-        yes_count = row[1]
-    except KeyError:
-        yes_count = 0
-
-    try:
-        maybe_count = row[2]
-    except KeyError:
-        maybe_count = 0
-
-    try:
-        bad_count = row[3]
-    except KeyError:
-        bad_count = 0
+    share_df = df.filter(like="count").div(df.total_count, axis=0)
+    share_df.drop("total_count", inplace=True, axis=1)
+    share_df.columns = share_df.columns.str.replace("_count", "_share")
+    return df.join(share_df)
 
-    total_count = no_count + yes_count + maybe_count + bad_count
-    assert total_count > 0, "Total count for result must be bigger than zero."
 
-    return [total_count, no_count, yes_count, maybe_count, bad_count]
+def calc_parent_option_count(
+    df: pd.DataFrame,
+    custom_options: typing.Dict[int, typing.Set[int]],
+) -> pd.DataFrame:
+    df_new = df.copy()
+    # Update option count using sub options count
+    for option, sub_options in custom_options.items():
+        for sub_option in sub_options:
+            df_new[f"{option}_count"] += df_new[f"{sub_option}_count"]
+    return df_new
 
 
-def calc_quadkey(row):
+def calc_count(df: pd.DataFrame) -> pd.DataFrame:
+    df_new = df.filter(like="count")
+    df_new_sum = df_new.sum(axis=1)
+    return df_new_sum
 
 
+def calc_quadkey(row: pd.DataFrame):
     """Calculate quadkey based on task id."""
+    # TODO: This does not make sense for media type, digtitalization.
+    # For these projects types we should move to project classes.
     try:
         tile_z, tile_x, tile_y = row["task_id"].split("-")
         quadkey = tile_functions.tile_coords_and_zoom_to_quadKey(
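
For the classic four categories the rewritten calc_agreement is algebraically identical to the old expression, since sum(n_i**2) - n equals sum(n_i * (n_i - 1)); what changed is that it now handles any number of answer columns, and tasks with zero or one contributor yield None instead of 1.0. A small worked check with hypothetical counts:

import pandas as pd

# one task, 4 users: 3 voted answer 0 ("no"), 1 voted answer 1 ("yes")
row = pd.Series({"0_count": 3, "1_count": 1, "2_count": 0, "3_count": 0, "total_count": 4})

n = row["total_count"]
counts = row.drop(labels=["total_count"])
agreement = (sum(i**2 for i in counts) - n) / (n * (n - 1))
print(agreement)  # 0.5, same as (3*2 + 1*0) / (4*3) under the old formula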
@@ -320,8 +302,42 @@ def calc_quadkey(row):
         return quadkey
 
 
+def get_custom_options(custom_options: pd.Series) -> typing.Dict[int, typing.Set[int]]:
+    eval_value = ast.literal_eval(custom_options.item())
+    return {
+        option["value"]: {
+            sub_option["value"] for sub_option in option.get("subOptions", [])
+        }
+        for option in eval_value
+    }
+
+
+def add_missing_result_columns(
+    df: typing.Union[pd.DataFrame, pd.Series],
+    custom_options: typing.Dict[int, typing.Set[int]],
+) -> pd.DataFrame:
+    """
+    Check if all possible answers columns are included in the grouped results
+    data frame and add columns if missing.
+    """
+
+    all_answer_label_values_set = set(
+        [
+            _option
+            for option, sub_options in custom_options.items()
+            for _option in [option, *sub_options]
+        ]
+    )
+    return df.reindex(
+        columns=sorted(all_answer_label_values_set),
+        fill_value=0,
+    )
+
+
 def get_agg_results_by_task_id(
-    results_df: pd.DataFrame, tasks_df: pd.DataFrame
+    results_df: pd.DataFrame,
+    tasks_df: pd.DataFrame,
+    custom_options_raw: pd.Series,
 ) -> pd.DataFrame:
     """
     For each task several users contribute results.

@@ -339,6 +355,7 @@ def get_agg_results_by_task_id(
     ----------
     results_df: pd.DataFrame
     tasks_df: pd.DataFrame
+    custom_options_raw: pd.Series
     """
 
     results_by_task_id_df = (

@@ -347,23 +364,31 @@ def get_agg_results_by_task_id(
         .unstack(fill_value=0)
     )
 
-    # calculate total count and check if other counts are defined
-    results_by_task_id_df[["total_count", 0, 1, 2, 3]] = results_by_task_id_df.apply(
-        lambda row: calc_count(row), axis=1, result_type="expand"
+    custom_options = get_custom_options(custom_options_raw)
+
+    # add columns for answer options that were not chosen for any task
+    results_by_task_id_df = add_missing_result_columns(
+        results_by_task_id_df,
+        custom_options,
     )
 
-    # calculate share based on counts
-    results_by_task_id_df[
-        ["0_share", "1_share", "2_share", "3_share"]
-    ] = results_by_task_id_df.apply(
-        lambda row: calc_share(row["total_count"], row[0], row[1], row[2], row[3]),
-        axis=1,
-        result_type="expand",
+    # needed for ogr2ogr todo: might be legacy?
+    results_by_task_id_df = results_by_task_id_df.add_suffix("_count")
+
+    # calculate total count of votes per task
+    results_by_task_id_df["total_count"] = calc_count(results_by_task_id_df)
+
+    results_by_task_id_df = calc_parent_option_count(
+        results_by_task_id_df,
+        custom_options,
    )
 
+    # calculate share based on counts
+    results_by_task_id_df = calc_share(results_by_task_id_df)
+
     # calculate agreement
     results_by_task_id_df["agreement"] = results_by_task_id_df.apply(
-        lambda row: calc_agreement(row["total_count"], row[0], row[1], row[2], row[3]),
+        calc_agreement,
         axis=1,
     )
     logger.info("calculated agreement")

@@ -383,11 +408,6 @@ def get_agg_results_by_task_id(
     )
     logger.info("added geometry to aggregated results")
 
-    # rename columns, ogr2ogr will fail otherwise
-    agg_results_df.rename(
-        columns={0: "0_count", 1: "1_count", 2: "2_count", 3: "3_count"}, inplace=True
-    )
-
     return agg_results_df
 
 
@@ -430,7 +450,11 @@ def get_per_project_statistics(project_id: str, project_info: pd.Series) -> dict
     add_metadata = False
 
     # aggregate results by task id
-    agg_results_df = get_agg_results_by_task_id(results_df, tasks_df)
+    agg_results_df = get_agg_results_by_task_id(
+        results_df,
+        tasks_df,
+        project_info["custom_options"],
+    )
     agg_results_df.to_csv(agg_results_filename, index_label="idx")
 
     geojson_functions.gzipped_csv_to_gzipped_geojson(
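
Taken together, the aggregation now derives its answer columns from the project's customOptions instead of hard-coding answers 0-3. A toy walk-through with illustrative values (one task, one parent option with a sub-option), mirroring the committed order of operations:

import ast
import pandas as pd

# counts per answer value after the groupby/unstack step (toy data)
df = pd.DataFrame({0: [2], 1: [1]}, index=["task-a"])

# customOptions text as delivered by the new SQL column
raw = pd.Series(['[{"value": 0, "subOptions": [{"value": 2}]}, {"value": 1}]'])
custom_options = {
    o["value"]: {s["value"] for s in o.get("subOptions", [])}
    for o in ast.literal_eval(raw.item())
}  # -> {0: {2}, 1: set()}

# add_missing_result_columns: answer 2 never appeared, so it is filled with 0
all_values = sorted({v for opt, subs in custom_options.items() for v in [opt, *subs]})
df = df.reindex(columns=all_values, fill_value=0).add_suffix("_count")

df["total_count"] = df.filter(like="count").sum(axis=1)  # calc_count

# calc_parent_option_count: sub-option votes roll up into their parent
df["0_count"] += df["2_count"]

print(df)  # -> 0_count=2, 1_count=1, 2_count=0, total_count=3

Note that calc_share joins the *_share columns onto the same frame before calc_agreement is applied, so as committed each row handed to calc_agreement still carries those share columns alongside the counts.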

mapswipe_workers/mapswipe_workers/generate_stats/tasking_manager_geometries.py

Lines changed: 13 additions & 8 deletions
@@ -19,10 +19,14 @@ def load_data(project_id: str, gzipped_csv_file: str) -> list:
     project_data = []
     with gzip.open(gzipped_csv_file, mode="rt") as f:
         reader = csv.reader(f, delimiter=",")
+        column_index_map = {}
 
         for i, row in enumerate(reader):
             if i == 0:
                 # skip header
+                column_index_map = {
+                    column_label: index for index, column_label in enumerate(row)
+                }
                 continue
 
             # the last row of the csv might contain a comment about data use

@@ -42,14 +46,15 @@
                     "task_x": task_x,
                     "task_y": task_y,
                     "task_z": task_z,
-                    "no_count": int(row[2]),
-                    "yes_count": int(row[3]),
-                    "maybe_count": int(row[4]),
-                    "bad_imagery_count": int(row[5]),
-                    "no_share": float(row[7]),
-                    "yes_share": float(row[8]),
-                    "maybe_share": float(row[9]),
-                    "bad_imagery_share": float(row[10]),
+                    # XXX: Assuming 0->No, 1->Yes, 2->Maybe, 3->Bad
+                    "no_count": int(column_index_map.get("0_count", 0)),
+                    "yes_count": int(column_index_map.get("1_count", 0)),
+                    "maybe_count": int(column_index_map.get("2_count", 0)),
+                    "bad_imagery_count": int(column_index_map.get("3_count", 0)),
+                    "no_share": float(column_index_map.get("0_count", 0)),
+                    "yes_share": float(column_index_map.get("1_count", 0)),
+                    "maybe_share": float(column_index_map.get("2_count", 0)),
+                    "bad_imagery_share": float(column_index_map.get("3_count", 0)),
                     "wkt": tile_functions.geometry_from_tile_coords(
                         task_x, task_y, task_z
                     ),
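
As committed, these dictionary entries store the column index returned by column_index_map.get(...) rather than the cell value at that index, and the four share fields look up *_count labels instead of *_share. A corrected sketch of the presumable intent (hypothetical helper, not part of this commit):

def read_row_values(row, column_index_map):
    def cell(label, cast, default):
        # resolve the header label to its position, then read the cell itself
        index = column_index_map.get(label)
        return cast(row[index]) if index is not None else default

    return {
        "no_count": cell("0_count", int, 0),
        "yes_count": cell("1_count", int, 0),
        "maybe_count": cell("2_count", int, 0),
        "bad_imagery_count": cell("3_count", int, 0),
        "no_share": cell("0_share", float, 0.0),
        "yes_share": cell("1_share", float, 0.0),
        "maybe_share": cell("2_share", float, 0.0),
        "bad_imagery_share": cell("3_share", float, 0.0),
    }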
