Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
dbff470
Investigate analysis of events at sim level
marghe-molaro Apr 3, 2024
bf64628
Merge branch 'master' into molaro/harvest-training-data
marghe-molaro Sep 17, 2024
05098f7
Final data-printing set-up
marghe-molaro Sep 30, 2024
16c071c
Print event chains
marghe-molaro Oct 2, 2024
ba81487
Add chains in mode 2 too and clean up in simuation
marghe-molaro Oct 2, 2024
0474624
Merged with master, and moved all logging into event module to keep t…
marghe-molaro Oct 2, 2024
b1c907c
Fix issue with tests by ensuring standard Polling and infection is ma…
marghe-molaro Oct 7, 2024
cfb4264
Switch iloc for loc
marghe-molaro Oct 7, 2024
e0327de
Change syntax of if statement
marghe-molaro Oct 7, 2024
fceee02
Change syntax of if statement and print string of event
marghe-molaro Oct 9, 2024
eaeae62
Focus on rti and print footprint
marghe-molaro Oct 10, 2024
c7bd9d0
Only store change in individual properties, not entire property row. …
marghe-molaro Oct 11, 2024
769aaec
Style fixes
marghe-molaro Oct 11, 2024
757cee3
Include printing of individual properties at the beginning and at bir…
marghe-molaro Oct 13, 2024
22a5e44
Log everything to simulation, as events logger doesn't seem to be vis…
marghe-molaro Oct 16, 2024
7faa817
Consider all modules included as of interest
marghe-molaro Oct 18, 2024
7232f97
Remove pop-wide HSI warning and make epi default even when printing c…
marghe-molaro Oct 18, 2024
98a8832
Merge branch 'master' into molaro/harvest-training-data
marghe-molaro Oct 18, 2024
a6def2d
Style fix
marghe-molaro Oct 18, 2024
ecea532
Remove data generation test, which wasn't really a test
marghe-molaro Oct 18, 2024
ae7a44c
Change dict of properties to string in logging, and add analysis files
marghe-molaro Oct 23, 2024
16299a2
Include debugging option, final set-up of scenario to print data, ana…
marghe-molaro Nov 25, 2024
0dd862f
Change label of person when iterating
marghe-molaro Nov 26, 2024
0e7dc99
Merge branch 'master' into molaro/harvest-training-data
marghe-molaro Dec 9, 2024
84f8263
Correctly retrieve event name
marghe-molaro Dec 13, 2024
a490d19
Modify scenario file such that can exclude specific services, and cor…
marghe-molaro Jan 20, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
528 changes: 528 additions & 0 deletions src/scripts/analysis_data_generation/analysis_extract_data.py

Large diffs are not rendered by default.

156 changes: 156 additions & 0 deletions src/scripts/analysis_data_generation/postprocess_events_chain.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
import pandas as pd
from dateutil.relativedelta import relativedelta

# Remove from every individual's event chain all events that were fired after death
def cut_off_events_after_death(df):
    """Drop, for every individual, all events logged after their death.

    Rows are grouped by ``person_ID``. Within each group the first row with a
    non-missing ``date_of_death`` is located and only the rows up to and
    including that row are kept; groups without any recorded death date are
    kept whole.

    Parameters
    ----------
    df : pandas.DataFrame
        Event chains; must contain ``person_ID`` and ``date_of_death`` columns.

    Returns
    -------
    pandas.DataFrame
        The retained rows, in the order the groups are yielded by ``groupby``
        and, within a group, in their original order. Original index labels
        are preserved. When there is no group at all, an empty frame with the
        same columns as ``df`` is returned.
    """
    kept_groups = []

    for _, group in df.groupby('person_ID'):
        # Label of the first row on which a death date has been recorded
        first_death_label = group['date_of_death'].first_valid_index()

        if first_death_label is not None:
            # Label-based slice: keeps rows up to and including the death row
            group = group.loc[:first_death_label]
        kept_groups.append(group)

    if not kept_groups:
        # Nothing to keep: preserve the schema so later column look-ups work
        return df.iloc[0:0].copy()

    # Concatenate once; growing a frame inside the loop re-copies it each time
    return pd.concat(kept_groups)

# Load into DataFrame
def load_csv_to_dataframe(file_path):
    """Read the raw event chains from a CSV file.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file, passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame or None
        The loaded frame. If the file is missing, or any other exception is
        raised while loading, a message is printed and ``None`` is returned.
    """
    try:
        raw_chains = pd.read_csv(file_path)
        print("Raw event chains loaded successfully!")
    except FileNotFoundError:
        print(f"Error: The file '{file_path}' was not found.")
        return None
    except Exception as err:
        print(f"An error occurred: {err}")
        return None
    return raw_chains

file_path = 'output.csv'  # Replace with the path to your CSV file

output = load_csv_to_dataframe(file_path)
if output is None:
    # The loader prints the reason and returns nothing on failure; stop here
    # instead of failing with a TypeError on the first column access below.
    raise SystemExit(-1)

# Some of the dates appeared not to be in datetime format. Correct here.
# errors='coerce' turns entries that cannot be parsed into NaT.
output['date_of_death'] = pd.to_datetime(output['date_of_death'], errors='coerce')
output['date_of_birth'] = pd.to_datetime(output['date_of_birth'], errors='coerce')
if 'hv_date_inf' in output.columns:
    output['hv_date_inf'] = pd.to_datetime(output['hv_date_inf'], errors='coerce')


date_start = pd.to_datetime('2010-01-01')
if 'Other' in output['cause_of_death'].values:
    print("ERROR: 'Other' was included in sim as possible cause of death")
    # SystemExit rather than exit(): exit() is a helper injected by the site
    # module and is not guaranteed to exist in every interpreter start-up mode.
    raise SystemExit(-1)

# Choose which columns in individual properties to visualise
columns_to_print = ['event', 'is_alive', 'hv_inf', 'hv_art', 'tb_inf', 'tb_date_active', 'event_date', 'when']
#columns_to_print =['person_ID', 'date_of_birth', 'date_of_death', 'cause_of_death','hv_date_inf', 'hv_art','tb_inf', 'tb_date_active', 'event date', 'event']

# When checking which individuals led to *any* changes in individual properties, exclude these columns from comparison
columns_to_exclude_in_comparison = ['when', 'event', 'event_date', 'age_exact_years', 'age_years', 'age_days', 'age_range', 'level', 'appt_footprint']

# If considering epidemiology consistent with sim, add check here.
check_ages_of_those_HIV_inf = False
if check_ages_of_those_HIV_inf:
    for index, row in output.iterrows():
        if pd.isna(row['hv_date_inf']):
            continue  # Skip this iteration
        # Age at infection, as the calendar difference between infection and birth
        diff = relativedelta(output.loc[index, 'hv_date_inf'], output.loc[index, 'date_of_birth'])
        # Flags infections at ages strictly between 1 and 15 whole years
        if diff.years > 1 and diff.years < 15:
            print("Person contracted HIV infection at age younger than 15", diff)

# Remove events after death
filtered_data = cut_off_events_after_death(output)

print_raw_events = True  # Print raw chain of events for each individual
print_selected_changes = False
print_all_changes = True
person_ID_of_interest = 494

# Show every row when printing a DataFrame instead of a truncated view
pd.set_option('display.max_rows', None)

# Walk through every individual's chain of events and report on it.
# NOTE(review): the extracted source had lost its indentation; the nesting
# below (everything inside the date-of-birth filter) is reconstructed from the
# statement order and should be confirmed against the original file.
for name, group in filtered_data.groupby('person_ID'):
    list_of_dob = group['date_of_birth']

    # Select individuals based on when they were born
    if list_of_dob.iloc[0].year < 2010:

        # Check that immutable properties are fixed for this individual, i.e. that events were collated properly:
        all_identical_dob = group['date_of_birth'].nunique() == 1
        all_identical_sex = group['sex'].nunique() == 1
        # Truthiness test rather than `is False`, which depends on the exact
        # boolean type returned by the comparison.
        if not (all_identical_dob and all_identical_sex):
            print("Immutable properties are changing! This is not chain for single individual")
            print(group)
            raise SystemExit(-1)

        print("----------------------------------------------------------------------")
        print("person_ID ", group['person_ID'].iloc[0], "d.o.b ", group['date_of_birth'].iloc[0])
        # Divide by 2 because a Before and an After row is printed for each event
        number_of_events = len(group) / 2
        print("Number of events for this individual ", group['person_ID'].iloc[0], "is :", number_of_events)
        number_of_changes = 0
        if print_raw_events:
            print(group)

        if print_all_changes:
            # Compare each row with the previous one, column by column.
            # Missing values are replaced by a sentinel on both sides so that
            # NaN -> NaN is not reported as a change.
            tracked = group.drop(columns=columns_to_exclude_in_comparison)
            comparison = tracked.fillna(-99999).ne(tracked.shift().fillna(-99999))

            # Iterate over rows where any column has changed (the first row has no predecessor)
            for idx, row_changed in comparison.iloc[1:].iterrows():
                if row_changed.any():  # Check if any column changed in this row
                    number_of_changes += 1
                    changed_columns = row_changed[row_changed].index.tolist()  # Get the columns where changes occurred
                    print(f"Row {idx} - Changes detected in columns: {changed_columns}")
                    columns_output = ['event', 'event_date', 'appt_footprint', 'level'] + changed_columns
                    print(group.loc[idx, columns_output])  # Print only the changed columns
                    if group.loc[idx, 'when'] == 'Before':
                        print('-----> THIS CHANGE OCCURRED BEFORE EVENT!')
                    #print(group.loc[idx,columns_to_print])
                    print()  # For better readability
            print("Number of changes is ", number_of_changes, "out of ", number_of_events, " events")

        if print_selected_changes:
            previous_tb_inf = group['tb_inf'].shift(1)
            previous_hv_inf = group['hv_inf'].shift(1)
            previous_hv_art = group['hv_art'].shift(1)
            previous_is_alive = group['is_alive'].shift(1)

            # Element-wise comparisons throughout: `series is True/False` is an
            # identity test on the Series object and is always False, which
            # silently disabled the HIV-infection and death conditions.
            tb_inf_condition = (
                (previous_tb_inf.eq('uninfected') & group['tb_inf'].eq('active')) |
                (previous_tb_inf.eq('latent') & group['tb_inf'].eq('active')) |
                (previous_tb_inf.eq('active') & group['tb_inf'].eq('latent')) |
                (previous_hv_inf.eq(False) & group['hv_inf'].eq(True)) |
                (previous_hv_art.eq('not') & group['hv_art'].eq('on_not_VL_suppressed')) |
                (previous_hv_art.eq('not') & group['hv_art'].eq('on_VL_suppressed')) |
                (previous_hv_art.eq('on_VL_suppressed') & group['hv_art'].eq('on_not_VL_suppressed')) |
                (previous_hv_art.eq('on_VL_suppressed') & group['hv_art'].eq('not')) |
                (previous_hv_art.eq('on_not_VL_suppressed') & group['hv_art'].eq('on_VL_suppressed')) |
                (previous_hv_art.eq('on_not_VL_suppressed') & group['hv_art'].eq('not'))
            )

            alive_condition = previous_is_alive.eq(True) & group['is_alive'].eq(False)

            # Combine conditions for rows of interest
            transition_condition = tb_inf_condition | alive_condition

            # NOTE(review): with the nesting assumed here this branch cannot
            # be reached, because the enclosing filter only admits births
            # before 2010. Kept as in the source; confirm the intended level.
            if list_of_dob.iloc[0].year >= 2010:
                print("DETECTED OF INTEREST")
                print(group[group['event'] == 'Birth'][columns_to_print])

            # Filter the DataFrame based on the condition
            filtered_transitions = group[transition_condition]
            if not filtered_transitions.empty:
                if list_of_dob.iloc[0].year < 2010:
                    print("DETECTED OF INTEREST")
                    print(filtered_transitions[columns_to_print])


print("Number of individuals simulated ", filtered_data.groupby('person_ID').ngroups)



180 changes: 180 additions & 0 deletions src/scripts/analysis_data_generation/scenario_generate_chains.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,180 @@
"""This Scenario file runs the model to generate event chains

Run on the batch system using:
```
tlo batch-submit
src/scripts/analysis_data_generation/scenario_generate_chains.py
```

or locally using:
```
tlo scenario-run src/scripts/analysis_data_generation/scenario_generate_chains.py
```

"""
from pathlib import Path
from typing import Dict

import pandas as pd

from tlo import Date, logging
from tlo.analysis.utils import get_parameters_for_status_quo, mix_scenarios, get_filtered_treatment_ids
from tlo.methods.fullmodel import fullmodel
from tlo.methods.scenario_switcher import ImprovedHealthSystemAndCareSeekingScenarioSwitcher
from tlo.scenario import BaseScenario
from tlo.methods import (
alri,
cardio_metabolic_disorders,
care_of_women_during_pregnancy,
contraception,
demography,
depression,
diarrhoea,
enhanced_lifestyle,
epi,
healthburden,
healthseekingbehaviour,
healthsystem,
hiv,
rti,
labour,
malaria,
newborn_outcomes,
postnatal_supervisor,
pregnancy_supervisor,
stunting,
symptommanager,
tb,
wasting,
)

class GenerateDataChains(BaseScenario):
    """Scenario definition used to produce event chains.

    Runs a hand-picked list of modules (see ``modules``) from 2010-01-01 for
    13 months with the ``generate_event_chains`` flag set. The draws are the
    entries of the dictionary built by ``_get_scenarios``.
    """

    def __init__(self):
        super().__init__()
        self.seed = 0
        self.start_date = Date(2010, 1, 1)
        self.end_date = self.start_date + pd.DateOffset(months=13)
        self.pop_size = 1000
        # One draw per named scenario
        self._scenarios = self._get_scenarios()
        self.number_of_draws = len(self._scenarios)
        self.runs_per_draw = 50
        self.generate_event_chains = True

    def log_configuration(self):
        """Return the logging set-up: output file name, directory and per-logger levels."""
        custom_levels = {
            '*': logging.WARNING,
            'tlo.methods.demography': logging.INFO,
            'tlo.methods.events': logging.INFO,
            'tlo.methods.demography.detail': logging.WARNING,
            'tlo.methods.healthburden': logging.INFO,
            'tlo.methods.healthsystem.summary': logging.INFO,
        }
        return {
            'filename': 'generate_event_chains',
            'directory': Path('./outputs'),  # <- (specified only for local running)
            'custom_levels': custom_levels,
        }

    def modules(self):
        """Return the list of module instances registered for this scenario."""
        # MODIFY
        # Here instead of running full module
        resources = self.resources
        return [
            demography.Demography(resourcefilepath=resources),
            enhanced_lifestyle.Lifestyle(resourcefilepath=resources),
            healthburden.HealthBurden(resourcefilepath=resources),
            symptommanager.SymptomManager(resourcefilepath=resources, spurious_symptoms=False),
            rti.RTI(resourcefilepath=resources),
            healthseekingbehaviour.HealthSeekingBehaviour(resourcefilepath=resources),
            #simplified_births.SimplifiedBirths(resourcefilepath=resourcefilepath),
            healthsystem.HealthSystem(
                resourcefilepath=resources,
                mode_appt_constraints=1,
                cons_availability='all',
            ),
        ]

        # return (
        #     fullmodel(resourcefilepath=self.resources)
        #     + [ImprovedHealthSystemAndCareSeekingScenarioSwitcher(resourcefilepath=self.resources)]
        # )

    # --- Disabled earlier variant (was kept inside a bare string literal) ---
    # def draw_parameters(self, draw_number, rng):
    #     return mix_scenarios(
    #         get_parameters_for_status_quo(),
    #         {
    #             'HealthSystem': {
    #                 'Service_Availability': list(self._scenarios.values())[draw_number],
    #             },
    #         }
    #     )
    #
    # def _get_scenarios(self) -> Dict[str, list[str]]:
    #     Return the Dict with values for the parameter `Service_Availability` keyed by a name for the scenario.
    #     The sequences of scenarios systematically omits one of the TREATMENT_ID's that is defined in the model.
    #
    #     # Generate list of TREATMENT_IDs and filter to the resolution needed
    #     treatments = get_filtered_treatment_ids(depth=2)
    #     treatments_RTI = [item for item in treatments if 'Rti' in item]
    #
    #     # Return 'Service_Availability' values, with scenarios for everything, nothing, and ones for which each
    #     # treatment is omitted
    #     service_availability = dict({"Everything": ["*", "Nothing": []})
    #     #service_availability.update(
    #     #    {f"No {t.replace('_*', '*')}": [x for x in treatments if x != t] for t in treatments_RTI}
    #     #)
    #
    #     return service_availability
    # --- End of disabled variant ---

    def draw_parameters(self, draw_number, rng):
        """Return the parameter overrides of draw ``draw_number``, or None if out of range."""
        if draw_number >= self.number_of_draws:
            return None
        scenario_parameters = list(self._scenarios.values())
        return scenario_parameters[draw_number]

    # case 1: gfHE = -0.030, factor = 1.01074
    # case 2: gfHE = -0.020, factor = 1.02116
    # case 3: gfHE = -0.015, factor = 1.02637
    # case 4: gfHE = 0.015, factor = 1.05763
    # case 5: gfHE = 0.020, factor = 1.06284
    # case 6: gfHE = 0.030, factor = 1.07326

    def _get_scenarios(self) -> Dict[str, Dict]:
        """Return the Dict with values for the parameters that are changed, keyed by a name for the scenario."""
        all_treatments = get_filtered_treatment_ids(depth=2)
        rti_treatments = [treatment for treatment in all_treatments if 'Rti' in treatment]

        # 'Service_Availability' values, with scenarios for everything, nothing, and ones for which each
        # RTI treatment is omitted
        service_availability = {"Everything": ["*"], "Nothing": []}
        for omitted in rti_treatments:
            label = f"No {omitted.replace('_*', '*')}"
            service_availability[label] = [treatment for treatment in all_treatments if treatment != omitted]
        print(service_availability.keys())

        # =========== STATUS QUO ============
        baseline_overrides = {
            "HealthSystem": {
                "Service_Availability": service_availability["No Rti_BurnManagement*"],
            },
        }
        return {
            "Baseline": mix_scenarios(self._baseline(), baseline_overrides),
        }

    def _baseline(self) -> Dict:
        """Return the Dict with values for the parameter changes that define the baseline scenario."""
        health_system_overrides = {
            "HealthSystem": {
                "mode_appt_constraints": 1,  # <-- Mode 1 prior to change to preserve calibration
                "cons_availability": "all",
            }
        }
        return mix_scenarios(
            get_parameters_for_status_quo(),
            health_system_overrides,
        )

# Script entry point: only runs when this file is executed directly.
if __name__ == '__main__':
    # Imported here so the import only happens when executed as a script
    from tlo.cli import scenario_run

    # Hand this very file to the tlo scenario runner
    scenario_run([__file__])
Loading