# Reconstructed from a newline-collapsed unified diff that touches two files:
#
#   nomad/metrics/metrics.py
#       * adds ``self_containment`` (below);
#       * drops ``include_groups=False`` from the two ``groupby(keys).apply``
#         calls that compute ``cent`` and ``rog``.
#   nomad/tests/test_metrics.py
#       * imports ``self_containment`` next to ``rog``;
#       * adds the ``test_self_containment_*`` tests (below).
#
# ``loader``, ``utils``, ``warnings``, ``np``, ``pd`` and ``pytest`` are used as
# bare names, exactly as in the diff; they are expected to come from the
# module-level imports of the respective files.


# --------------------------------------------------------------------------
# nomad/metrics/metrics.py
# --------------------------------------------------------------------------

def self_containment(stops, threshold, agg_freq='d', weighted=True, home_activity_type='home',
                     traj_cols=None, time_weights=None,
                     exploded=True, **kwargs):
    """
    Compute self-containment per time period (and per user, when available).

    Self-containment is the share of *non-home* activity that takes place
    within ``threshold`` distance of home. Home rows are the rows whose
    location_id equals ``home_activity_type``; within every (period, user)
    group the first home row is used as the reference point.

    Parameters
    ----------
    stops : pd.DataFrame
        Stop table with coordinates, a time column, a duration column and a
        location_id column. The input is copied, never modified.
    threshold : float
        Maximum distance from home for a stop to count as "contained".
        For x/y data it is compared against the Euclidean distance in
        coordinate units. For lon/lat data it is compared against the value
        returned by ``utils._haversine_distance`` (unit defined by that
        helper -- confirm before choosing a threshold).
    agg_freq : str
        Time bucket. Only ``'d'``, ``'D'``, ``'w'`` and ``'W'`` are accepted.
    weighted : bool
        If True, weight each non-home stop by its duration (times the
        optional time weight); otherwise every non-home stop counts once.
    home_activity_type : str
        Value of the location_id column that marks home rows.
    traj_cols : dict, optional
        Column-name mapping, resolved through ``loader._parse_traj_cols``.
    time_weights : pd.Series, optional
        Extra per-stop weights multiplied with the duration when
        ``weighted=True``. Must have the same length as ``stops`` and carry
        every index label of ``stops``.
    exploded : bool
        If True, ``stops`` is first passed through ``utils.explode_stops``
        so that stops straddling several periods are split.
    **kwargs
        Forwarded to ``loader._parse_traj_cols``, ``utils.explode_stops``
        and ``utils._fallback_st_cols``.

    Returns
    -------
    pd.DataFrame
        Columns ``['period', <user_id column if present>, 'self_containment']``.

    Raises
    ------
    ValueError
        If ``agg_freq`` is not allowed, if the location_id or duration column
        is missing, or if ``time_weights`` is not a matching ``pd.Series``.

    Warns
    -----
    UserWarning
        If no row at all is a home row, and if exploding the stops actually
        produced additional rows.

    Notes
    -----
    * A group without any non-home row yields ``1.0``.
    * A group without a home row has no reference point, so none of its
      stops counts as contained and the result is ``0.0``.
    * In weighted mode a group whose total weight is zero yields ``NaN``.
    """
    stops = stops.copy()

    # Only day and week buckets are supported.
    allowed_freqs = ['d', 'w', 'D', 'W']
    if agg_freq not in allowed_freqs:
        raise ValueError(f"agg_freq must be one of {allowed_freqs} (got '{agg_freq}')")

    traj_cols = loader._parse_traj_cols(stops.columns, traj_cols, kwargs)

    loc_key = traj_cols["location_id"]
    if loc_key not in stops.columns:
        raise ValueError(f"Missing required '{loc_key}' column")

    if not (stops[loc_key] == home_activity_type).any():
        warnings.warn(
            f"No home locations found (location_id == '{home_activity_type}'). "
            f"Self-containment cannot be calculated without home locations.",
            UserWarning,
            stacklevel=2,
        )

    # Attach the extra weights as a column *before* exploding, so every
    # (possibly split) row keeps its own weight regardless of what happens
    # to the index afterwards.
    if time_weights is not None:
        is_valid = (
            isinstance(time_weights, pd.Series)
            and len(time_weights) == len(stops)
            # A label missing from time_weights would silently become a NaN
            # weight; reject it up front, as the error message promises.
            and bool(stops.index.isin(time_weights.index).all())
        )
        if not is_valid:
            raise ValueError("time_weights must be a pd.Series with the same length and index as stops.")
        stops['time_weights'] = time_weights

    if exploded:
        rows_before = len(stops)
        stops = utils.explode_stops(stops, agg_freq=agg_freq, **kwargs)
        # Warn only when something was actually split.
        # NOTE(review): assumes explode_stops adds one row per extra period a
        # stop covers -- confirm against its implementation.
        if len(stops) > rows_before:
            warnings.warn(
                f"Some stops straddle multiple {agg_freq.upper()}s. They will be exploded into separate rows.",
                UserWarning,
                stacklevel=2,
            )

    # 1) Column mapping + check
    t_key, coord_x, coord_y, use_datetime, use_lon_lat = utils._fallback_st_cols(stops.columns, traj_cols, kwargs)
    dur_key = traj_cols['duration']
    if dur_key not in stops.columns:
        raise ValueError("Missing required 'duration' column")

    # 2) Time buckets
    if use_datetime:
        temp_dt = stops[traj_cols[t_key]]
    else:
        temp_dt = pd.to_datetime(stops[traj_cols[t_key]], unit='s')
    # NOTE(review): only upper-case 'W' is re-anchored to 'W-MON'; lower-case
    # 'w' is handed to to_period() unchanged, so the two spellings may bucket
    # weeks differently. Left as-is because the week convention used by
    # utils.explode_stops is not visible here -- confirm which one is intended.
    if agg_freq == "W":
        agg_freq = "W-MON"

    stops['period'] = temp_dt.dt.to_period(agg_freq).dt.start_time

    # 3) Grouping keys
    keys = ['period']
    uid_key = traj_cols['user_id']
    if uid_key in stops.columns:
        keys.append(uid_key)

    # 4) Distance of every row of `rows` from the first row of `home_rows`.
    #    Results are positional arrays, so duplicate index labels (e.g. after
    #    exploding) cannot mix up rows.
    def _distances_from_home(rows, home_rows):
        if use_lon_lat:
            lat_key = traj_cols['latitude']
            lon_key = traj_cols['longitude']
            home_rad = np.radians(home_rows[[lat_key, lon_key]].iloc[0].values)
            return np.array(
                [
                    utils._haversine_distance(np.radians([lat, lon]), home_rad)
                    for lat, lon in zip(rows[lat_key], rows[lon_key])
                ],
                dtype=float,
            )
        home_xy = home_rows[[coord_x, coord_y]].iloc[0].values
        dx = rows[coord_x] - home_xy[0]
        dy = rows[coord_y] - home_xy[1]
        return np.sqrt(dx * dx + dy * dy).to_numpy()

    # 5) Self-containment of one (period, user) group
    def _group_self_containment(g):
        if len(g) == 0:
            return np.nan

        non_home = g[g[loc_key] != home_activity_type]
        if len(non_home) == 0:
            return 1.0  # No non-home activities = perfectly contained at home

        home_rows = g[g[loc_key] == home_activity_type]
        if len(home_rows) == 0:
            # No reference point in this group: nothing can be "within".
            within = np.zeros(len(non_home), dtype=bool)
        else:
            # NaN distances compare as False, i.e. "not contained".
            within = _distances_from_home(non_home, home_rows) <= threshold

        if not weighted:
            return within.sum() / len(non_home)

        durations = non_home[dur_key]
        if 'time_weights' in non_home.columns:
            # Preferred: the weight travels with the row.
            weights = durations * non_home['time_weights']
        elif time_weights is not None:
            # Fallback if the column did not survive explode_stops.
            weights = durations * time_weights.loc[non_home.index]
        else:
            weights = durations

        total_weight = weights.sum()
        if total_weight == 0:
            return np.nan
        return (weights * within).sum() / total_weight

    result = stops.groupby(keys).apply(_group_self_containment)
    return result.reset_index(name='self_containment')


# --------------------------------------------------------------------------
# nomad/tests/test_metrics.py
# --------------------------------------------------------------------------

def _sc_stops(x, duration, start, end, location_id, user_id):
    """Build a stop table on the line y == 0 for the self-containment tests."""
    return pd.DataFrame({
        'x': x,
        'y': [0] * len(x),
        'duration': duration,
        'start_timestamp': pd.to_datetime(start),
        'end_timestamp': pd.to_datetime(end),
        'location_id': location_id,
        'user_id': user_id,
    })


def _sc_run(stops, threshold, **overrides):
    """Call self_containment with the keyword set shared by all tests."""
    options = {
        'agg_freq': 'd',
        'weighted': True,
        # Fresh dict per call so no test can leak a mutated mapping.
        'traj_cols': {'x': 'x', 'y': 'y', 'duration': 'duration',
                      'user_id': 'user_id', 'location_id': 'location_id'},
        'start_col': 'start_timestamp',
        'end_col': 'end_timestamp',
        'use_datetime': True,
        'exploded': False,
    }
    options.update(overrides)
    return self_containment(stops, threshold=threshold, **options)


def test_self_containment_basic():
    """Test basic self-containment calculation with simple data."""
    # Home at x=0, other activities at increasing distances.
    stops = _sc_stops(
        x=[0, 10, 20, 30],
        duration=[60, 30, 30, 30],  # 60 min home, 30 min each for others
        start=['2024-01-01 00:00', '2024-01-01 01:00', '2024-01-01 02:00', '2024-01-01 03:00'],
        end=['2024-01-01 01:00', '2024-01-01 01:30', '2024-01-01 02:30', '2024-01-01 03:30'],
        location_id=['home', 'work', 'shopping', 'restaurant'],
        user_id=[1, 1, 1, 1],
    )

    # Threshold 15 captures work (10) but not shopping (20) or restaurant (30).
    result = _sc_run(stops, threshold=15)

    # Only work (30 min) is within threshold out of 90 min non-home time.
    assert len(result) == 1
    assert np.allclose(result['self_containment'].values[0], 30/90)


def test_self_containment_unweighted():
    """Test unweighted self-containment (count of activities)."""
    stops = _sc_stops(
        x=[0, 10, 20, 30],
        duration=[60, 10, 50, 30],  # Different durations
        start=['2024-01-01 00:00', '2024-01-01 01:00', '2024-01-01 02:00', '2024-01-01 03:00'],
        end=['2024-01-01 01:00', '2024-01-01 01:10', '2024-01-01 02:50', '2024-01-01 03:30'],
        location_id=['home', 'work', 'shopping', 'restaurant'],
        user_id=[1, 1, 1, 1],
    )

    result = _sc_run(stops, threshold=15, weighted=False)

    # 1 out of 3 non-home activities is within threshold.
    assert len(result) == 1
    assert np.allclose(result['self_containment'].values[0], 1/3)


def test_self_containment_multi_user():
    """Test self-containment with multiple users."""
    # User 1: home at 0, work at 5; User 2: home at 0, work at 50.
    stops = _sc_stops(
        x=[0, 5, 0, 50],
        duration=[60, 30, 60, 30],
        start=['2024-01-01 00:00', '2024-01-01 01:00', '2024-01-01 00:00', '2024-01-01 01:00'],
        end=['2024-01-01 01:00', '2024-01-01 01:30', '2024-01-01 01:00', '2024-01-01 01:30'],
        location_id=['home', 'work', 'home', 'work'],
        user_id=[1, 1, 2, 2],
    )

    # User 1's work is within the threshold, User 2's is not.
    result = _sc_run(stops, threshold=10)

    assert len(result) == 2
    # User 1: all non-home time (30 min) is within threshold.
    user1_result = result[result['user_id'] == 1]['self_containment'].values[0]
    assert np.allclose(user1_result, 1.0)

    # User 2: no non-home time is within threshold.
    user2_result = result[result['user_id'] == 2]['self_containment'].values[0]
    assert np.allclose(user2_result, 0.0)


def test_self_containment_no_home():
    """Test self-containment when there's no home location."""
    stops = _sc_stops(
        x=[10, 20, 30],
        duration=[30, 30, 30],
        start=['2024-01-01 01:00', '2024-01-01 02:00', '2024-01-01 03:00'],
        end=['2024-01-01 01:30', '2024-01-01 02:30', '2024-01-01 03:30'],
        location_id=['work', 'shopping', 'restaurant'],
        user_id=[1, 1, 1],
    )

    # Should trigger a warning about no home locations.
    with pytest.warns(UserWarning, match="No home locations found"):
        result = _sc_run(stops, threshold=15)

    # Without a home reference nothing counts as contained, so the result is 0.0.
    assert len(result) == 1
    assert np.allclose(result['self_containment'].values[0], 0.0)


def test_self_containment_all_home():
    """Test self-containment when all activities are at home."""
    stops = _sc_stops(
        x=[0, 0, 0],
        duration=[30, 30, 30],
        start=['2024-01-01 01:00', '2024-01-01 02:00', '2024-01-01 03:00'],
        end=['2024-01-01 01:30', '2024-01-01 02:30', '2024-01-01 03:30'],
        location_id=['home', 'home', 'home'],
        user_id=[1, 1, 1],
    )

    result = _sc_run(stops, threshold=15)

    # No non-home activities at all: perfectly contained.
    assert len(result) == 1
    assert np.allclose(result['self_containment'].values[0], 1.0)


def test_self_containment_with_time_weights():
    """Test self-containment with additional time weights."""
    stops = _sc_stops(
        x=[0, 10, 20, 30],
        duration=[60, 30, 30, 30],
        start=['2024-01-01 00:00', '2024-01-01 01:00', '2024-01-01 02:00', '2024-01-01 03:00'],
        end=['2024-01-01 01:00', '2024-01-01 01:30', '2024-01-01 02:30', '2024-01-01 03:30'],
        location_id=['home', 'work', 'shopping', 'restaurant'],
        user_id=[1, 1, 1, 1],
    )

    # Double the weight of the work stop (second row); all others stay at 1.
    time_weights = pd.Series([1.0, 2.0, 1.0, 1.0], index=stops.index)

    result = _sc_run(stops, threshold=15, time_weights=time_weights)

    # Work (30 min * 2.0) = 60 weighted minutes within threshold.
    # Total non-home: work (60) + shopping (30) + restaurant (30) = 120.
    assert len(result) == 1
    assert np.allclose(result['self_containment'].values[0], 60/120)