Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/changes/944.feature.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Enhanced phase-dispersion-minimisation search by enabling weighted events.
85 changes: 53 additions & 32 deletions stingray/pulse/pulsar.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,17 @@ def phase_exposure(start_time, stop_time, period, nbin=16, gti=None):
return expo / np.max(expo)


def fold_events(times, *frequency_derivatives, **opts):
def fold_events(
times,
*frequency_derivatives,
mode="ef",
nbin=16,
weights=1,
gti=None,
ref_time=0,
expocorr=False,
variances=None,
):
"""Epoch folding with exposure correction.

By default, the keyword `times` accepts a list of
Expand All @@ -232,21 +242,21 @@ def fold_events(times, *frequency_derivatives, **opts):
nbin : int, optional, default 16
The number of bins in the pulse profile

weights : float or array of floats, optional
weights : float or array of floats, optional, default 1
The weights of the data. It can either be specified as a single value
for all points, or an array with the same length as ``time``

gti : [[gti0_0, gti0_1], [gti1_0, gti1_1], ...], optional
gti : [[gti0_0, gti0_1], [gti1_0, gti1_1], ...], optional, default None
Good time intervals

ref_time : float, optional, default 0
Reference time for the timing solution

expocorr : bool, default False
expocorr : bool, optional, default False
Correct each bin for exposure (use when the period of the pulsar is
comparable to that of GTIs)

mode : str, ["ef", "pdm"], default "ef"
mode : str, ["ef", "pdm"], optional, default "ef"
Whether to calculate the epoch folding or phase dispersion
minimization folded profile. For "ef", it calculates the (weighted)
sum of the data points in each phase bin, for "pdm", the variance
Expand All @@ -264,23 +274,12 @@ def fold_events(times, *frequency_derivatives, **opts):
The uncertainties on the pulse profile
"""

mode = opts.pop("mode", "ef")
nbin = opts.pop("nbin", 16)
weights = opts.pop("weights", 1)
# If no key is passed, *or gti is None*, defaults to the
# initial and final event
gti = opts.pop("gti", None)
if gti is None:
gti = [[times[0], times[-1]]]
# Be safe if gtis are a list
gti = np.asanyarray(gti)
ref_time = opts.pop("ref_time", 0)
expocorr = opts.pop("expocorr", False)

if opts:
raise ValueError(
f"Unidentified keyword(s) to fold_events: {', '.join([k for k in opts.keys()])} \n Please refer to the description of the function for optional parameters."
)

if not isinstance(weights, Iterable):
weights *= np.ones(len(times))
Expand Down Expand Up @@ -315,23 +314,38 @@ def fold_events(times, *frequency_derivatives, **opts):
raw_profile_err = raw_profile_err / expo_norm

elif mode == "pdm":
# Convert the weights to a numpy array just in case it is a list
weights = np.asarray(weights)
if np.allclose(weights, 1.0):
raise ValueError(
"Can only calculate PDM for binned light curves!"
+ "`weights` attribute must be set to fluxes!"
)

raw_profile, bins, bin_idx = scipy.stats.binned_statistic(
phases, weights, statistic=np.var, bins=np.linspace(0, 1, nbin + 1)
# For weighted pdm, the measured uncertainties (variances) for each event are required
# set the event_weights as 1/variances or else 1s if variances=None
event_weights = (
(1.0 / np.asarray(variances)) if variances is not None else np.ones_like(weights)
)

# I need the variance uncorrected for the number of data points in each
# bin, so I need to find that first, and then multiply
# bin_idx should be from the list [1, ..., nbin], although some might not appear
# histogram bin-edges set as [0.5, 1.5, ..., nbin - 0.5, nbin + 0.5]
# to include the bin_idx int values
bincounts, _ = np.histogram(bin_idx, bins=np.arange(0.5, nbin + 1.5))
raw_profile = raw_profile * bincounts
bins = np.linspace(0, 1, nbin + 1)
# phases are already fractional values in [0, 1]
# casting float to int removes the fractional part
bin_indices = (phases * nbin).astype(int)
# to wrap: for phase = 1 then bin_index = nbin; clipping changes it to bin_index = 0
bin_indices = np.clip(bin_indices, 0, nbin - 1)

# for efficiency, the sum-of-squared deviations for each bin is split
# sosdev = sum_in_bin( (values_in_bin - mean_in_bin)**2 )
# mean_in_bin = sum_in_bin(values_in_bin) / n_in_bin
# so need n_in_bin, sum_in_bin, sos_in_bin (sum-of-squares)
# now compute the stats for each bin using bincount
# minlength=nbin prevents array shortening when a bin index is not represented in bin_indices
n_in_bin = np.bincount(bin_indices, weights=event_weights, minlength=nbin)
sum_in_bin = np.bincount(bin_indices, weights=weights * event_weights, minlength=nbin)
sos_in_bin = np.bincount(bin_indices, weights=weights**2 * event_weights, minlength=nbin)
# put it together to make sum-of-squared deviations: sosdev
raw_profile = sos_in_bin - sum_in_bin**2 / n_in_bin

# dummy array for the error, which we don't have for the variance
raw_profile_err = np.zeros_like(raw_profile)
Expand Down Expand Up @@ -369,18 +383,19 @@ def ef_profile_stat(profile, err=None):
return np.sum((profile - mean) ** 2 / err**2)


def pdm_profile_stat(profile, sample_var, nsample):
def pdm_profile_stat(profile, sum_dev2, nsample):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is changing the name of the variable needed? Also, is the variable something different from before?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi @matteobachetti, thanks for the review. Sorry for the long wait in coming back to this!

The variable name has been changed to reflect that it is no longer the variance but rather the numerator in typical variance calculations, which is the sum of squared deviances, i.e. (x - xbar)**2. This is calculated in the fold_events function in the more performant (but made obscure) method. The variance is obtained from the sum of squared deviances by dividing by the number of degrees of freedom. I thought this makes the statistics function pdm_profile_stat a bit more cohesive now as it computes statistical values, using the degrees of freedom information available to it, whereas the fold_events function is responsible for computing the profile and its higher order moments.

Copy link
Member

@matteobachetti matteobachetti Feb 10, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@nabilbrice I see. This might be problematic, because as a policy we do not make breaking changes to the code, if not at a major release and after a period where we mark something as deprecated.
Would it be possible to make this new argument optional, and use it if not None instead of sample_var, rather than changing the signature of the function?

"""Calculate the phase dispersion minimization
statistic following Stellingwerf (1978)

Parameters
----------
profile : array
The PDM pulse profile (variance as a function
of phase)
The PDM pulse profile (normalised sum-of-squared-deviations),
which uses the within bin means

sample_var : float
The total population variance of the sample
sum_dev2 : float
The sum-of-squared-deviations of the whole sample,
which uses the grand (whole sample together) mean

nsample : int
The number of time bins in the initial time
Expand All @@ -391,8 +406,14 @@ def pdm_profile_stat(profile, sample_var, nsample):
stat : float
The epoch folding statistics
"""
s2 = np.sum(profile) / (nsample - len(profile))
stat = s2 / sample_var
# Get the mean-of-squared-deviations from sum-of-squared-deviations by
# considering the number of degrees of freedom:
# - (nsample - nbin) since there are nbin means used for
# the within bin sum-of-squared-deviations, which is stored in the profile
# - (nsample - 1) for the grand sum_dev2 because there is 1 grand mean used
mean_dev2_in_bin = np.sum(profile) / (nsample - len(profile))
mean_dev2_grand = sum_dev2 / (nsample - 1)
stat = mean_dev2_in_bin / mean_dev2_grand
return stat


Expand Down
29 changes: 25 additions & 4 deletions stingray/pulse/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,15 @@ def stat_fun(t, f, fd=0, **kwargs):


def phase_dispersion_search(
times, flux, frequencies, nbin=128, segment_size=5000, expocorr=False, gti=None, fdots=0
times,
flux,
frequencies,
nbin=128,
segment_size=5000,
expocorr=False,
gti=None,
fdots=0,
variances=None,
):
"""Performs folding at trial frequencies in time series data (i.e.~a light curve
of flux or photon counts) and computes the Phase Dispersion Minimization statistic.
Expand Down Expand Up @@ -227,9 +235,21 @@ def phase_dispersion_search(

def stat_fun(t, f, fd=0, **kwargs):
bins, profile, _ = fold_events(t, f, fd, **kwargs, mode="pdm")
len_flux = len(kwargs["weights"])
sigma = np.var(kwargs["weights"]) * len_flux / (len_flux - 1)
return pdm_profile_stat(profile, sigma, len_flux)
flux = kwargs["weights"]
len_flux = len(flux)
# Get the variances from kwargs (possibly modified) rather than as a closure
variances_local = kwargs.get("variances", None)
event_weights = (
(1.0 / np.asarray(variances_local))
if variances_local is not None
else np.ones_like(flux)
)
mean_grand = np.average(flux, weights=event_weights)
dev2_grand = (flux - mean_grand) ** 2

sosdev_grand = np.dot(dev2_grand, event_weights)

return pdm_profile_stat(profile, sosdev_grand, len_flux)

return _folding_search(
stat_fun,
Expand All @@ -242,6 +262,7 @@ def stat_fun(t, f, fd=0, **kwargs):
gti=gti,
nbin=nbin,
fdots=fdots,
variances=variances,
)


Expand Down
53 changes: 51 additions & 2 deletions stingray/pulse/tests/test_pulse.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,9 +91,58 @@ def test_stat(self):
def test_pdm_stat(self):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please test that the new keyword to folding_search also works.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the review on this @matteobachetti! I've now added a new test in 8e312a2. Let me know if it is suitable! It tests two scenarios (using the new variances argument and not) when there is a point that has an extremely high variance, while the other points are very uniform.

"""Test pulse phase calculation, frequency only."""
prof = np.array([1, 1, 1, 1, 1])
sample_var = 2.0
sample_var = 2
nsample = 10
np.testing.assert_array_almost_equal(pdm_profile_stat(prof, sample_var, nsample), 0.5)
sum_dev2 = sample_var * (nsample - 1)
np.testing.assert_array_almost_equal(pdm_profile_stat(prof, sum_dev2, nsample), 0.5)

def test_weighted_pdm_stat(self):
"""Test the PDM calculation with weighted events (variances)."""
period = 1.0
nbin = 5

# Constant signal: flux=10 everywhere. The pdm should be 0.
times = np.array([0.1, 0.3, 0.5, 0.7, 0.9])
fluxes = np.array([10.0, 10.0, 10.0, 10.0, 10.0])
std_var = (
0.1 # instead of 1.0, to check that avoiding division by zero isn't changing results
)
variances_base = np.ones_like(fluxes) * std_var

_, profile_base, _ = fold_events(
times, 1 / period, nbin=nbin, weights=fluxes, mode="pdm", ref_time=0
)
assert np.allclose(profile_base, 0.0) # equal with the mean at every bin

# An outlier point at 0.15 (Bin 0) with flux 1000
times_out = np.append(times, 0.15)
fluxes_out = np.append(fluxes, 1000.0)

# High variance for the outlier, standard for the rest
outlier_var = 1.0e6
variances = np.append(variances_base, outlier_var)

# Unweighted PDM on dirty data (should be skewed)
_, profile_unweighted, _ = fold_events(
times_out, 1 / period, nbin=nbin, weights=fluxes_out, mode="pdm", ref_time=0
)
assert profile_unweighted[0] > 1000.0 # the profile bin should deviate from 0

# Weighted PDM on dirty data (should effectively ignore outlier)
_, profile_weighted, _ = fold_events(
times_out,
1 / period,
nbin=nbin,
weights=fluxes_out,
variances=variances,
mode="pdm",
ref_time=0,
)
# With high variance, the outlier's weight is tiny.
# The bin statistics should be dominated by the point with flux 10.
# The result should be very close to the baseline (0) and smaller than the unweighted result.
assert np.all(profile_weighted <= profile_unweighted)
assert np.allclose(profile_weighted, 0.0, atol=2.0)

def test_zn(self):
"""Test pulse phase calculation, frequency only."""
Expand Down
2 changes: 1 addition & 1 deletion stingray/pulse/tests/test_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ def test_plot_phaseogram_direct(self):

def test_search_wrong_key_fails(self):
with pytest.raises(
ValueError, match=r"Unidentified keyword\(s\) to fold_events: fdot, fddot"
TypeError, match=r"fold_events\(\) got an unexpected keyword argument 'fdot'"
):
phase, prof, _ = fold_events(self.event_times, self.pulse_frequency, fdot=12, fddot=34)

Expand Down