Skip to content

Commit 42d7143

Browse files
committed
1 parent 14a5fa0 commit 42d7143

File tree

3 files changed

+124
-93
lines changed

3 files changed

+124
-93
lines changed

mapswipe_workers/mapswipe_workers/generate_stats/overall_stats.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,19 @@ def get_project_static_info(filename: str) -> pd.DataFrame:
6565
,regexp_replace(look_for, E'[\\n\\r]+', ' ', 'g' ) as look_for
6666
,project_type
6767
,image
68+
-- Custom options values
69+
,CASE
70+
WHEN project_type_specifics->'customOptions' IS NOT NULL
71+
THEN -- thus if we have answer labels use them
72+
ARRAY(
73+
SELECT json_array_elements(
74+
project_type_specifics->'customOptions'
75+
)->>'value'
76+
)
77+
ELSE -- otherwise use below label range as the mapswipe app default
78+
'{0,1,2,3}'
79+
END as custom_options_values
80+
-- custom_options_values -> parent - child relation
6881
-- add an array of the tile server names
6982
,CASE
7083
WHEN project_type_specifics->'tileServer'->'name' IS NOT NULL THEN

mapswipe_workers/mapswipe_workers/generate_stats/project_stats.py

Lines changed: 65 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import ast
12
import datetime
23
import gzip
34
import json
@@ -238,76 +239,45 @@ def get_groups(filename: str, project_id: str) -> pd.DataFrame:
238239
return df
239240

240241

241-
def calc_agreement(total: int, no: int, yes: int, maybe: int, bad: int) -> float:
242+
def calc_agreement(row: pd.Series) -> float:
242243
"""
243-
for each task the "agreement" is computed as defined by Scott's Pi
244-
Scott's Pi is a measure for inter-rater reliability
245-
https://en.wikipedia.org/wiki/Scott%27s_Pi
244+
for each task the "agreement" is computed (i.e. the extent to which
245+
raters agree for the i-th subject). This measure is a component of
246+
Fleiss' kappa: https://en.wikipedia.org/wiki/Fleiss%27_kappa
246247
"""
247248

248-
# TODO: currently this is implemented only for the 4 given categories
249+
# Total number of votes for this task (precomputed sum of all categories)
250+
n = row["total_count"]
249251

250-
if total == 1:
251-
agreement = 1.0
252+
row = row.drop(labels=["total_count"])
253+
# extent to which raters agree for the ith subject
254+
# set agreement to None if only one user contributed
255+
if n == 1 or n == 0:
256+
agreement = None
252257
else:
253-
agreement = (
254-
1.0
255-
/ (total * (total - 1))
256-
* (
257-
(no * (no - 1))
258-
+ (yes * (yes - 1))
259-
+ (maybe * (maybe - 1))
260-
+ (bad * (bad - 1))
261-
)
262-
)
258+
agreement = (sum([i**2 for i in row]) - n) / (n * (n - 1))
263259

264260
return agreement
265261

266262

267-
def calc_share(total: int, no: int, yes: int, maybe: int, bad: int) -> List[float]:
263+
def calc_share(df: pd.DataFrame) -> pd.DataFrame:
268264
"""Calculate the share of each category on the total count."""
269-
no_share = no / total
270-
yes_share = yes / total
271-
maybe_share = maybe / total
272-
bad_share = bad / total
273-
274-
return [no_share, yes_share, maybe_share, bad_share]
275-
276-
277-
def calc_count(row) -> List[int]:
278-
"""
279-
Check if a count exists for each category ("no", "yes", "maybe", "bad").
280-
Then calculate total count as the sum of all categories.
281-
"""
282-
283-
try:
284-
no_count = row[0]
285-
except KeyError:
286-
no_count = 0
287-
288-
try:
289-
yes_count = row[1]
290-
except KeyError:
291-
yes_count = 0
292-
293-
try:
294-
maybe_count = row[2]
295-
except KeyError:
296-
maybe_count = 0
297-
298-
try:
299-
bad_count = row[3]
300-
except KeyError:
301-
bad_count = 0
265+
share_df = df.filter(like="count").div(df.total_count, axis=0)
266+
share_df.drop("total_count", inplace=True, axis=1)
267+
share_df.columns = share_df.columns.str.replace("_count", "_share")
268+
return df.join(share_df)
302269

303-
total_count = no_count + yes_count + maybe_count + bad_count
304-
assert total_count > 0, "Total count for result must be bigger than zero."
305270

306-
return [total_count, no_count, yes_count, maybe_count, bad_count]
271+
def calc_count(df: pd.DataFrame) -> pd.DataFrame:
272+
df_new = df.filter(like="count")
273+
df_new_sum = df_new.sum(axis=1)
274+
return df_new_sum
307275

308276

309-
def calc_quadkey(row):
277+
def calc_quadkey(row: pd.DataFrame):
310278
"""Calculate quadkey based on task id."""
279+
# TODO: This does not make sense for media type, digitization.
280+
# For these project types we should move to project classes.
311281
try:
312282
tile_z, tile_x, tile_y = row["task_id"].split("-")
313283
quadkey = tile_functions.tile_coords_and_zoom_to_quadKey(
@@ -320,8 +290,26 @@ def calc_quadkey(row):
320290
return quadkey
321291

322292

293+
def add_missing_result_columns(
294+
df: pd.DataFrame,
295+
custom_options_values: pd.Series
296+
) -> pd.DataFrame:
297+
"""
298+
Check if all possible answers columns are included in the grouped results
299+
data frame and add columns if missing.
300+
"""
301+
302+
all_answer_label_values_list = list(
303+
ast.literal_eval(custom_options_values.item())
304+
)
305+
df = df.reindex(columns=all_answer_label_values_list, fill_value=0)
306+
return df
307+
308+
323309
def get_agg_results_by_task_id(
324-
results_df: pd.DataFrame, tasks_df: pd.DataFrame
310+
results_df: pd.DataFrame,
311+
tasks_df: pd.DataFrame,
312+
custom_options_values: pd.Series,
325313
) -> pd.DataFrame:
326314
"""
327315
For each task several users contribute results.
@@ -339,6 +327,7 @@ def get_agg_results_by_task_id(
339327
----------
340328
results_df: pd.DataFrame
341329
tasks_df: pd.DataFrame
330+
custom_options_values: pd.Series
342331
"""
343332

344333
results_by_task_id_df = (
@@ -347,23 +336,27 @@ def get_agg_results_by_task_id(
347336
.unstack(fill_value=0)
348337
)
349338

350-
# calculate total count and check if other counts are defined
351-
results_by_task_id_df[["total_count", 0, 1, 2, 3]] = results_by_task_id_df.apply(
352-
lambda row: calc_count(row), axis=1, result_type="expand"
339+
# add columns for answer options that were not chosen for any task
340+
results_by_task_id_df = add_missing_result_columns(
341+
results_by_task_id_df,
342+
custom_options_values,
353343
)
354344

345+
# TODO: Add logic for parent values using sub values
346+
# [<parent_value> = <parent_value> + <child_1_value> + .. <child_N_value>]
347+
348+
# needed for ogr2ogr (TODO: might be legacy?)
349+
results_by_task_id_df = results_by_task_id_df.add_suffix("_count")
350+
351+
# calculate total count of votes per task
352+
results_by_task_id_df["total_count"] = calc_count(results_by_task_id_df)
353+
355354
# calculate share based on counts
356-
results_by_task_id_df[
357-
["0_share", "1_share", "2_share", "3_share"]
358-
] = results_by_task_id_df.apply(
359-
lambda row: calc_share(row["total_count"], row[0], row[1], row[2], row[3]),
360-
axis=1,
361-
result_type="expand",
362-
)
355+
results_by_task_id_df = calc_share(results_by_task_id_df)
363356

364357
# calculate agreement
365358
results_by_task_id_df["agreement"] = results_by_task_id_df.apply(
366-
lambda row: calc_agreement(row["total_count"], row[0], row[1], row[2], row[3]),
359+
calc_agreement,
367360
axis=1,
368361
)
369362
logger.info("calculated agreement")
@@ -383,11 +376,6 @@ def get_agg_results_by_task_id(
383376
)
384377
logger.info("added geometry to aggregated results")
385378

386-
# rename columns, ogr2ogr will fail otherwise
387-
agg_results_df.rename(
388-
columns={0: "0_count", 1: "1_count", 2: "2_count", 3: "3_count"}, inplace=True
389-
)
390-
391379
return agg_results_df
392380

393381

@@ -430,7 +418,11 @@ def get_per_project_statistics(project_id: str, project_info: pd.Series) -> dict
430418
add_metadata = False
431419

432420
# aggregate results by task id
433-
agg_results_df = get_agg_results_by_task_id(results_df, tasks_df)
421+
agg_results_df = get_agg_results_by_task_id(
422+
results_df,
423+
tasks_df,
424+
project_info["custom_options_values"],
425+
)
434426
agg_results_df.to_csv(agg_results_filename, index_label="idx")
435427

436428
geojson_functions.gzipped_csv_to_gzipped_geojson(

mapswipe_workers/mapswipe_workers/generate_stats/user_stats.py

Lines changed: 46 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,40 @@
11
import pandas as pd
22

33

4+
def get_agreeing_contributions_per_user_and_task(row):
5+
"""
6+
Compare user contribution to classifications of other users by calculating
7+
the number of agreeing and disagreeing results.
8+
"""
9+
10+
# XXX: We need to figure out which values to check: parent or child?
11+
r = row["result"]
12+
count_str = f"{r}_count"
13+
# ignore -999 values
14+
if count_str == "-999_count":
15+
return 0
16+
else:
17+
return row[count_str] - 1
18+
19+
20+
def get_disagreeing_contributions_per_user_and_task(row):
21+
"""
22+
Compare user contribution to classifications of other users by calculating
23+
the number of agreeing and disagreeing results.
24+
"""
25+
26+
total_count = row["total_count"]
27+
if total_count == 0:
28+
return 0
29+
else:
30+
agreeing_contributions = row["agreeing_contributions"]
31+
disagreeing_contributions = total_count - (agreeing_contributions + 1)
32+
return disagreeing_contributions
33+
34+
435
def get_agg_results_by_user_id(
5-
results_df: pd.DataFrame, agg_results_df: pd.DataFrame
36+
results_df: pd.DataFrame,
37+
agg_results_df: pd.DataFrame
638
) -> pd.DataFrame:
739
"""
840
For each user we calculate the number of total contributions (tasks)
@@ -15,26 +47,20 @@ def get_agg_results_by_user_id(
1547
Returns a pandas dataframe.
1648
"""
1749
raw_contributions_df = results_df.merge(
18-
agg_results_df, left_on="task_id", right_on="task_id"
50+
agg_results_df,
51+
left_on="task_id",
52+
right_on="task_id",
53+
)
54+
55+
raw_contributions_df["agreeing_contributions"] = raw_contributions_df.apply(
56+
get_agreeing_contributions_per_user_and_task,
57+
axis=1,
58+
)
59+
60+
raw_contributions_df["disagreeing_contributions"] = raw_contributions_df.apply(
61+
get_disagreeing_contributions_per_user_and_task,
62+
axis=1,
1963
)
20-
# compare to classifications of other users
21-
# Calc number of agreeig and disagreeing results from other users.
22-
raw_contributions_df.loc[
23-
raw_contributions_df["result"] == 0, "agreeing_contributions"
24-
] = (raw_contributions_df["0_count"] - 1)
25-
raw_contributions_df.loc[
26-
raw_contributions_df["result"] == 1, "agreeing_contributions"
27-
] = (raw_contributions_df["1_count"] - 1)
28-
raw_contributions_df.loc[
29-
raw_contributions_df["result"] == 2, "agreeing_contributions"
30-
] = (raw_contributions_df["2_count"] - 1)
31-
raw_contributions_df.loc[
32-
raw_contributions_df["result"] == 3, "agreeing_contributions"
33-
] = (raw_contributions_df["3_count"] - 1)
34-
35-
raw_contributions_df["disagreeing_contributions"] = raw_contributions_df[
36-
"total_count"
37-
] - (raw_contributions_df["agreeing_contributions"] + 1)
3864

3965
agg_results_by_user_id_df = raw_contributions_df.groupby(
4066
["project_id", "user_id", "username"]

0 commit comments

Comments
 (0)