From 40834d9f6b3daff918126573b2b93ac3d826fa3d Mon Sep 17 00:00:00 2001
From: tommoral
Date: Thu, 30 May 2024 11:18:37 +0200
Subject: [PATCH 1/4] ENH expect pd.DataFrame as input of FaDIn.fit

---
 fadin/solver.py      |  8 ++++++--
 fadin/utils/utils.py | 46 ++++++++++++++++++++++++++++++++++++++--------
 2 files changed, 44 insertions(+), 10 deletions(-)

diff --git a/fadin/solver.py b/fadin/solver.py
index 18a3a7f..8839e96 100644
--- a/fadin/solver.py
+++ b/fadin/solver.py
@@ -252,8 +252,12 @@ def fit(self, events, end_time):
 
         Parameters
         ----------
-        events : list of array of size number of timestamps,
-            list size is self.n_dim.
+        events : pd.DataFrame
+            The events used to infer the parameters of the Hawkes process.
+            They should be given as a pd.DataFrame with the columns:
+            - `'time'` or index: the time of the event.
+            - `'type'`: the type of the event.
+            - `'mark'` (optional): the mark of the event.
 
         end_time : int
             The end time of the Hawkes process.
diff --git a/fadin/utils/utils.py b/fadin/utils/utils.py
index 36e9270..eff85a2 100644
--- a/fadin/utils/utils.py
+++ b/fadin/utils/utils.py
@@ -68,16 +68,46 @@ def check_params(list_params, number_params):
 
 def projected_grid(events, grid_step, n_grid):
     """Project the events on the defined grid.
+
+    Parameters
+    ----------
+    events : pd.DataFrame
+        The events used to infer the parameters of the Hawkes process.
+        They should be given as a pd.DataFrame with the columns:
+        - `'time'` or index: the time of the event.
+        - `'type'`: the type of the event.
+        - `'mark'` (optional): the mark of the event.
+
+    grid_step : float
+        The step size of the grid.
+
+    n_grid : int
+        The number of grid points.
+
+    Returns
+    -------
+    events_grid : torch.Tensor
+        The events projected on the grid.
""" - n_dim = len(events) - # size_discret = int(1 / grid_step) + # compute time of the event on the grid + events['time_g'] = (events['time'] / grid_step).round().astype(int) + + # Compute sum of the marks, or number of events at each time in the grid + if 'mark' in events.columns: + events = events.groupby(['type', 'time_g'])['mark'].sum() + else: + events = events.groupby(['type', 'time_g']).count() + + # Make sure the resulting DataFrame has the right columns + events.name = "mark_sum" + events = events.reset_index() + + # Initialize the grid and fill it with events + n_dim = events['type'].nunique() events_grid = torch.zeros(n_dim, n_grid) - for i in range(n_dim): - ei_torch = torch.tensor(events[i]) - temp = torch.round(ei_torch / grid_step).long() # * grid_step - # temp2 = torch.round(temp * size_discret) - idx, data = np.unique(temp, return_counts=True) - events_grid[i, idx] += torch.tensor(data) + + i, idx = events['type'].values, events['time_g'].values + events_grid[i, idx] += torch.tensor(events["mark_sum"]) return events_grid From 9e4f8183fdf65503b48e48a482614c9ea60ee1bd Mon Sep 17 00:00:00 2001 From: tommoral Date: Thu, 30 May 2024 11:37:26 +0200 Subject: [PATCH 2/4] FIX generate events as pandas --- fadin/utils/utils.py | 2 +- fadin/utils/utils_simu.py | 132 +++++++++----------------------------- 2 files changed, 33 insertions(+), 101 deletions(-) diff --git a/fadin/utils/utils.py b/fadin/utils/utils.py index eff85a2..2976794 100644 --- a/fadin/utils/utils.py +++ b/fadin/utils/utils.py @@ -96,7 +96,7 @@ def projected_grid(events, grid_step, n_grid): if 'mark' in events.columns: events = events.groupby(['type', 'time_g'])['mark'].sum() else: - events = events.groupby(['type', 'time_g']).count() + events = events.groupby(['type', 'time_g']).size() # Make sure the resulting DataFrame has the right columns events.name = "mark_sum" diff --git a/fadin/utils/utils_simu.py b/fadin/utils/utils_simu.py index 25dd64b..ce38644 100644 --- a/fadin/utils/utils_simu.py +++ b/fadin/utils/utils_simu.py @@ -1,16 +1,30 @@ import numpy as np -from scipy.optimize import minimize_scalar -from scipy.interpolate import interp1d -from scipy.integrate import simps -import scipy.stats as stats +import pandas as pd + +import scipy +from scipy import stats def find_max(intensity_function, duration): """Find the maximum intensity of a function.""" - res = minimize_scalar(lambda x: -intensity_function(x), bounds=(0, duration)) + res = scipy.optimize.minimize_scalar( + lambda x: -intensity_function(x), bounds=(0, duration) + ) return -res.fun +def to_pandas(events): + + return pd.DataFrame({ + 'time': np.concatenate(events), + 'type': np.concatenate([[i] * len(events[i]) for i in range(n_dim)]) + }) + + +def from_pandas(events): + return [events.query("type == @i") for i in events['type'].unique()] + + def check_random_state(seed): """Turn seed into a np.random.RandomState instance. If seed is None, return the RandomState singleton used by np.random. 
@@ -84,7 +98,7 @@ def __init__(self, custom_density, params=dict(), kernel_length=10): # function to normalise the pdf over chosen domain def normalisation(self, x): - return simps(self.pdf(x), x) + return scipy.integrate.simps(self.pdf(x), x) def create_cdf_ppf(self): # define normalization support with the given kernel length @@ -98,8 +112,10 @@ def create_cdf_ppf(self): # make sure cdf bounded on [0,1] my_cdf = my_cdf / my_cdf[-1] # create cdf and ppf - func_cdf = interp1d(discrete_time, my_cdf) - func_ppf = interp1d(my_cdf, discrete_time, fill_value='extrapolate') + func_cdf = scipy.interpolate.interp1d(discrete_time, my_cdf) + func_ppf = scipy.interpolate.interp1d( + my_cdf, discrete_time, fill_value='extrapolate' + ) return func_cdf, func_ppf # pdf function for averaged normals @@ -151,7 +167,7 @@ def simu_poisson(end_time, intensity, upper_bound=None, random_state=None): assert isinstance(intensity, (int, float)) n_events = rng.poisson(lam=intensity*end_time, size=1) events = np.sort(rng.uniform(0, end_time, size=n_events)) - return events + return to_pandas(events) if upper_bound is None: upper_bound = find_max(intensity, end_time) @@ -164,7 +180,7 @@ def simu_poisson(end_time, intensity, upper_bound=None, random_state=None): # ogata's thinning algorithm accepted = intensity(ev_x) > ev_y events = np.sort(ev_x[accepted]) - return events + return to_pandas(events) def simu_multi_poisson(end_time, intensity, upper_bound=None, random_state=None): @@ -206,7 +222,7 @@ def simu_multi_poisson(end_time, intensity, upper_bound=None, random_state=None) for i in range(n_dim): evs = np.sort(rng.uniform(0, end_time, size=n_events[i])) events.append(evs) - return events + return to_pandas(events) if upper_bound is None: upper_bound = np.zeros(n_dim) @@ -221,7 +237,7 @@ def simu_multi_poisson(end_time, intensity, upper_bound=None, random_state=None) events.append(np.sort(ev_xi[accepted_i])) - return events + return to_pandas(events) def simu_hawkes_cluster(end_time, baseline, alpha, kernel, @@ -282,7 +298,7 @@ def simu_hawkes_cluster(end_time, baseline, alpha, kernel, upper_bound=upper_bound, random_state=random_state) gen = dict(gen0=immigrants) - events = immigrants.copy() + events = from_pandas(immigrants) it = 0 while len(gen[f'gen{it}']): @@ -317,7 +333,8 @@ def simu_hawkes_cluster(end_time, baseline, alpha, kernel, it += 1 - return events + + return to_pandas(events) def simu_hawkes_thinning(end_time, baseline, alpha, kernel, @@ -402,89 +419,4 @@ def simu_hawkes_thinning(end_time, baseline, alpha, kernel, del events[i][-1] events[i] = np.array(events[i]) - return events - - -# def simu_hawkes(end_time, baseline, alpha, kernel, upper_bound=None, -# random_state=None): -# """ Simulate a Hawkes process following an immigration-birth procedure. -# Edge effects may be reduced according to the second references below. - -# References: - -# Møller, J., & Rasmussen, J. G. (2006). Approximate simulation of Hawkes processes. -# Methodology and Computing in Applied Probability, 8, 53-64. - -# Møller, J., & Rasmussen, J. G. (2005). Perfect simulation of Hawkes processes. -# Advances in applied probability, 37(3), 629-646. - -# Parameters -# ---------- -# end_time : int | float -# Duration of the Poisson process. - -# baseline : float -# Baseline parameter of the Hawkes process. - -# alpha : float -# Weight parameter associated to the kernel function. - -# kernel: str -# The choice of the kernel for the simulation. -# Kernel available are probability distribution from scipy.stats module. 
- -# upper_bound : int, float or None, default=None -# Upper bound of the baseline function. If None, -# the maximum of the baseline is taken onto a finite discrete grid. - -# random_state : int, RandomState instance or None, default=None -# Set the numpy seed to 'random_state'. - -# Returns -# ------- -# events : array -# The timestamps of the point process' events. -# """ -# if random_state is None: -# np.random.seed(0) -# else: -# np.random.seed(random_state) - -# # Simulate events from baseline -# immigrants = simu_poisson(end_time, intensity=baseline, upper_bound=upper_bound) - -# # initialize -# gen = dict(gen0=immigrants) -# ev = [immigrants] -# sample_from_kernel = getattr(scipy.stats, kernel) # dic = [4, 3] - -# # generate offsprings -# it = 0 -# while len(gen[f'gen{it}']): -# Ci = gen[f'gen{it}'] - -# # number of descendents of an event follow a Poisson law of parameter alpha -# # the expected number of event induced by an event is then alpha - -# Di = np.random.poisson(lam=alpha, size=len(Ci)) - -# # sample interval range according to the given pdf (kernel of the parameter) -# Ci = np.repeat(Ci, repeats=Di) -# n = Di.sum() -# Ei = sample_from_kernel.rvs(size=n) # * dic -# Fi = Ci + Ei - - -# if len(Fi) > 0: -# gen[f'gen{it+1}'] = np.hstack(Fi) -# ev.append(np.hstack(Fi)) -# else: -# break -# it += 1 - -# # Add immigrants and sort -# events = np.hstack(ev) -# valid_events = events < end_time -# events = np.sort(events[valid_events]) - -# return events + return to_pandas(events) From 8035e9fe5eab857c2eabacf4595185858952393d Mon Sep 17 00:00:00 2001 From: tommoral Date: Thu, 30 May 2024 14:39:09 +0200 Subject: [PATCH 3/4] FIX tests --- fadin/utils/utils_simu.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/fadin/utils/utils_simu.py b/fadin/utils/utils_simu.py index ce38644..f4f051e 100644 --- a/fadin/utils/utils_simu.py +++ b/fadin/utils/utils_simu.py @@ -17,12 +17,18 @@ def to_pandas(events): return pd.DataFrame({ 'time': np.concatenate(events), - 'type': np.concatenate([[i] * len(events[i]) for i in range(n_dim)]) + 'type': np.concatenate([ + [i] * len(evi) for i, evi in enumerate(events) + ]) }) def from_pandas(events): - return [events.query("type == @i") for i in events['type'].unique()] + assert 'mark' not in events.columns + return [ + events.query("type == @i")['time'].values + for i in events['type'].unique() + ] def check_random_state(seed): @@ -297,8 +303,9 @@ def simu_hawkes_cluster(end_time, baseline, alpha, kernel, immigrants = simu_multi_poisson(end_time, baseline, upper_bound=upper_bound, random_state=random_state) + immigrants = from_pandas(immigrants) gen = dict(gen0=immigrants) - events = from_pandas(immigrants) + events = immigrants.copy() it = 0 while len(gen[f'gen{it}']): @@ -333,7 +340,6 @@ def simu_hawkes_cluster(end_time, baseline, alpha, kernel, it += 1 - return to_pandas(events) @@ -363,8 +369,8 @@ def simu_hawkes_thinning(end_time, baseline, alpha, kernel, kernel: str The choice of the kernel for the simulation. Kernel available are probability distribution from scipy.stats module. - Note that this function will automatically convert the scipy kernel to a - finite support kernel of size 'kernel_length'. + Note that this function will automatically convert the scipy kernel to + a finite support kernel of size 'kernel_length'. kernel_length: int Length of kernels in the Hawkes process. 
From 44c5acd370140d0008e6d06bfe1906a35b3d8557 Mon Sep 17 00:00:00 2001 From: tommoral Date: Thu, 30 May 2024 14:44:58 +0200 Subject: [PATCH 4/4] FIX add missing deps --- pyproject.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index e6f5458..89c028d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,7 +31,8 @@ dependencies = [ "torch>=1.11", "scipy>=1.8.0", "numba>=0.55.2", - "matplotlib>=3.7.2" + "matplotlib>=3.7.2", + "pandas>=2.0" ]
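As a sanity check on the patch 3 helpers, the round trip below stacks per-dimension timestamp arrays into the new DataFrame format and recovers them. `to_pandas` and `from_pandas` are copied from the diff above; the simulated timestamps are made-up data:

    import numpy as np
    import pandas as pd

    def to_pandas(events):
        # Stack the per-dimension timestamp arrays into a single
        # DataFrame with 'time' and 'type' columns (patch 3 version).
        return pd.DataFrame({
            'time': np.concatenate(events),
            'type': np.concatenate([
                [i] * len(evi) for i, evi in enumerate(events)
            ])
        })

    def from_pandas(events):
        # Inverse transform: one array of timestamps per event type.
        assert 'mark' not in events.columns
        return [
            events.query("type == @i")['time'].values
            for i in events['type'].unique()
        ]

    rng = np.random.RandomState(0)
    raw = [np.sort(rng.uniform(0, 10, size=5)),
           np.sort(rng.uniform(0, 10, size=3))]
    recovered = from_pandas(to_pandas(raw))
    assert all(np.allclose(a, b) for a, b in zip(raw, recovered))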