diff --git a/libpysal/graph/_kernel.py b/libpysal/graph/_kernel.py index 344ec30c6..86ffd2c2f 100644 --- a/libpysal/graph/_kernel.py +++ b/libpysal/graph/_kernel.py @@ -2,6 +2,8 @@ import pandas from scipy import optimize, sparse, spatial, stats +from libpysal.kernels import _kernel_functions + from ._utils import ( CoplanarError, _build_coplanarity_lookup, @@ -22,59 +24,6 @@ _VALID_GEOMETRY_TYPES = ["Point"] -def _triangular(distances, bandwidth): - u = numpy.clip(distances / bandwidth, 0, 1) - return 1 - u - - -def _parabolic(distances, bandwidth): - u = numpy.clip(distances / bandwidth, 0, 1) - return 0.75 * (1 - u**2) - - -def _gaussian(distances, bandwidth): - u = distances / bandwidth - return numpy.exp(-((u / 2) ** 2)) / (numpy.sqrt(2 * numpy.pi)) - - -def _bisquare(distances, bandwidth): - u = numpy.clip(distances / bandwidth, 0, 1) - return (15 / 16) * (1 - u**2) ** 2 - - -def _cosine(distances, bandwidth): - u = numpy.clip(distances / bandwidth, 0, 1) - return (numpy.pi / 4) * numpy.cos(numpy.pi / 2 * u) - - -def _exponential(distances, bandwidth): - u = distances / bandwidth - return numpy.exp(-u) - - -def _boxcar(distances, bandwidth): - r = (distances < bandwidth).astype(int) - return r - - -def _identity(distances, _): - return distances - - -_kernel_functions = { - "triangular": _triangular, - "parabolic": _parabolic, - "gaussian": _gaussian, - "bisquare": _bisquare, - "cosine": _cosine, - "boxcar": _boxcar, - "discrete": _boxcar, - "exponential": _exponential, - "identity": _identity, - None: _identity, -} - - def _kernel( coordinates, bandwidth=None, @@ -86,6 +35,7 @@ def _kernel( taper=True, coplanar="raise", resolve_isolates=True, + exclude_self_weights=True, ): """ Compute a kernel function over a distance matrix. @@ -134,6 +84,8 @@ def _kernel( remove links with a weight equal to zero resolve_isolates : bool Try to resolve isolates. Can be disabled if we are dealing with cliques later. + exclude_self_weights : bool (default: True) + Remove self-weights """ if metric != "precomputed": coordinates, ids, _ = _validate_geometry_input( @@ -188,11 +140,12 @@ def _kernel( data = sq.flatten() i = numpy.tile(numpy.arange(sq.shape[0]), sq.shape[0]) j = numpy.repeat(numpy.arange(sq.shape[0]), sq.shape[0]) - # remove diagonal - data = numpy.delete(data, numpy.arange(0, data.size, sq.shape[0] + 1)) - i = numpy.delete(i, numpy.arange(0, i.size, sq.shape[0] + 1)) - j = numpy.delete(j, numpy.arange(0, j.size, sq.shape[0] + 1)) - # construct sparse + + if exclude_self_weights: + data = numpy.delete(data, numpy.arange(0, data.size, sq.shape[0] + 1)) + i = numpy.delete(i, numpy.arange(0, i.size, sq.shape[0] + 1)) + j = numpy.delete(j, numpy.arange(0, j.size, sq.shape[0] + 1)) + d = sparse.csc_array((data, (i, j))) else: d = sparse.csc_array(coordinates) diff --git a/libpysal/graph/tests/test_builders.py b/libpysal/graph/tests/test_builders.py index c384d0dd5..f601e76a4 100644 --- a/libpysal/graph/tests/test_builders.py +++ b/libpysal/graph/tests/test_builders.py @@ -228,6 +228,7 @@ def setup_method(self): self.gdf_str = self.gdf.set_index("placeid") def test_kernel_precompute(self): + pytest.importorskip("pyproj") sklearn = pytest.importorskip("sklearn") df = gpd.read_file(geodatasets.get_path("nybb")) df = df.to_crs(df.estimate_utm_crs()) @@ -237,26 +238,26 @@ def test_kernel_precompute(self): g = graph.Graph.build_kernel(distmat, metric="precomputed") expected = np.array( [ - 0.126, - 0.266, - 0.174, - 0.071, - 0.126, - 0.329, - 0.311, - 0.291, - 0.266, - 0.329, - 0.31, - 0.205, - 0.174, - 0.311, - 0.31, - 0.339, - 0.071, - 0.291, - 0.205, - 0.339, + 0.04, + 0.177, + 0.076, + 0.013, + 0.04, + 0.271, + 0.242, + 0.212, + 0.177, + 0.271, + 0.241, + 0.105, + 0.076, + 0.242, + 0.241, + 0.288, + 0.013, + 0.212, + 0.105, + 0.288, ] ) diff --git a/libpysal/graph/tests/test_kernel.py b/libpysal/graph/tests/test_kernel.py index dfe0d41dc..d275b2c6e 100644 --- a/libpysal/graph/tests/test_kernel.py +++ b/libpysal/graph/tests/test_kernel.py @@ -121,7 +121,7 @@ def test_ids(ids, grocs): data = grocs.set_index(ids) if ids else grocs head, tail, _ = _kernel(data) np.testing.assert_array_equal(pd.unique(head), data.index) - assert np.in1d(tail, data.index).all() + assert np.isin(tail, data.index).all() @pytest.mark.network @@ -210,8 +210,8 @@ def test_kernels(kernel, grocs): assert weight.mean() == pytest.approx(0.10312196315841769) assert weight.max() == pytest.approx(0.749881829575671) elif kernel == "gaussian": - assert weight.mean() == pytest.approx(0.19932294761630429) - assert weight.max() == pytest.approx(0.3989265663183409) + assert weight.mean() == pytest.approx(0.13787969156713978) + assert weight.max() == pytest.approx(0.39891085285421685) elif kernel == "bisquare": assert weight.mean() == pytest.approx(0.09084085210598618) assert weight.max() == pytest.approx(0.9372045972129259) @@ -255,6 +255,7 @@ def test_bandwidth(data, bandwidth): ], ) def test_metric(metric, grocs): + pytest.importorskip("pyproj") data = grocs.to_crs(4326) if metric == "haversine" else grocs if not HAS_SKLEARN and metric in ["chebyshev", "haversine"]: pytest.skip("metric not supported by scipy") @@ -293,6 +294,7 @@ def test_metric(metric, grocs): ], ) def test_metric_k(metric, grocs): + pytest.importorskip("pyproj") data = grocs.to_crs(4326) if metric == "haversine" else grocs if not HAS_SKLEARN and metric in ["chebyshev", "haversine"]: pytest.skip("metric not supported by scipy") @@ -329,7 +331,7 @@ def test_coplanar(grocs): [grocs, grocs.iloc[:10], grocs.iloc[:3]], ignore_index=True ) # plain kernel - head, tail, weight = _kernel(grocs_duplicated) + head, tail, weight = _kernel(grocs_duplicated, taper=False) assert head.shape[0] == len(grocs_duplicated) * (len(grocs_duplicated) - 1) assert tail.shape == head.shape assert weight.shape == head.shape @@ -347,7 +349,7 @@ def test_coplanar(grocs): np.testing.assert_array_equal(pd.unique(head), grocs_duplicated.index) # k, clique - head, tail, weight = _kernel(grocs_duplicated, k=2, coplanar="clique") + head, tail, weight = _kernel(grocs_duplicated, k=2, coplanar="clique", taper=False) assert head.shape[0] >= len(grocs_duplicated) * 2 assert tail.shape == head.shape assert weight.shape == head.shape diff --git a/libpysal/graph/tests/test_triangulation.py b/libpysal/graph/tests/test_triangulation.py index c60ef26aa..1d55df72a 100644 --- a/libpysal/graph/tests/test_triangulation.py +++ b/libpysal/graph/tests/test_triangulation.py @@ -257,24 +257,25 @@ def test_ids(ids, stores_unique): def test_kernel(): _, _, weight = _delaunay(cau_coords, kernel="gaussian") + expected = np.array( [ - 0.231415, - 0.307681, - 0.395484, - 0.231415, - 0.237447, - 0.057774, - 0.307681, - 0.237447, - 0.123151, - 0.319563, - 0.057774, - 0.123151, - 0.035525, - 0.395484, - 0.319563, - 0.035525, + 0.134237, + 0.237297, + 0.392056, + 0.134237, + 0.141327, + 0.008367, + 0.237297, + 0.141327, + 0.038016, + 0.255978, + 0.008367, + 0.038016, + 0.003163, + 0.392056, + 0.255978, + 0.003163, ] ) diff --git a/libpysal/kernels.py b/libpysal/kernels.py new file mode 100644 index 000000000..123d775fc --- /dev/null +++ b/libpysal/kernels.py @@ -0,0 +1,283 @@ +"""kernels.py + +This module defines a collection of common kernel functions used for +distance-based weighting in spatial analysis, nonparametric regression, +and density estimation. + +Each kernel function takes as input an array of distances and a +bandwidth parameter and returns an array of values for the kernel +evaluated over the distances with the specified bandwidth. + +A general ``kernel()`` dispatcher is provided to apply a named kernel or a +user-supplied callable. + +Available kernels: + - ``triangular`` + - ``parabolic`` (Epanechnikov) + - ``gaussian`` + - ``bisquare`` (quartic) + - ``cosine`` + - ``exponential`` + - ``boxcar`` (uniform) + - ``identity`` (raw distances) + +Mathematical Formulation +------------------------ + +All kernels are evaluated as: + +.. math:: + + K(z), \\quad \\text{where} \\ z = \\frac{d}{h} + +- :math:`d` is the distance. +- :math:`h` is the kernel bandwidth. +- For :math:`z > 1`, all kernels return :math:`K(z) = 0`. + +""" + +import numpy + + +def _trim(d, bandwidth): + """ + Normalize and clip distances to the range [0, 1]. + + Parameters + ---------- + d : ndarray + Array of distances. + bandwidth : float + Bandwidth parameter. + + Returns + ------- + ndarray + Clipped and normalized distances. + """ + return numpy.clip(numpy.abs(d) / bandwidth, 0, 1) + + +def _triangular(distances, bandwidth): + """ + Triangular kernel function. + + Parameters + ---------- + distances : ndarray + Array of distances. + bandwidth : float + Kernel bandwidth. + + Returns + ------- + ndarray + Triangular kernel weights. + """ + return 1 - _trim(distances, bandwidth) + + +def _parabolic(distances, bandwidth): + """ + Parabolic (Epanechnikov) kernel function. + + Parameters + ---------- + distances : ndarray + Array of distances. + bandwidth : float + Kernel bandwidth. + + Returns + ------- + ndarray + Parabolic kernel weights. + """ + z = _trim(distances, bandwidth) + return 0.75 * (1 - z**2) + + +def _gaussian(distances, bandwidth): + """ + Gaussian kernel function (truncated at z=1). + + Parameters + ---------- + distances : ndarray + Array of distances. + bandwidth : float + Kernel bandwidth. + + Returns + ------- + ndarray + Gaussian kernel weights. + """ + z = distances / bandwidth + exponent_term = -0.5 * (z**2) + c = 1 / numpy.sqrt(2 * numpy.pi) + k = c * numpy.exp(exponent_term) + return k + + +def _bisquare(distances, bandwidth): + """ + Bisquare (quartic) kernel function. + + Parameters + ---------- + distances : ndarray + Array of distances. + bandwidth : float + Kernel bandwidth. + + Returns + ------- + ndarray + Bisquare kernel weights. + """ + z = numpy.clip(distances / bandwidth, 0, 1) + return (15 / 16) * (1 - z**2) ** 2 + + +def _cosine(distances, bandwidth): + """ + Cosine kernel function. + + Parameters + ---------- + distances : ndarray + Array of distances. + bandwidth : float + Kernel bandwidth. + + Returns + ------- + ndarray + Cosine kernel weights. + """ + z = numpy.clip(distances / bandwidth, 0, 1) + return (numpy.pi / 4) * numpy.cos(numpy.pi / 2 * z) + + +def _exponential(distances, bandwidth): + """ + Exponential kernel function, truncated at z=1. + + Parameters + ---------- + distances : ndarray + Array of distances. + bandwidth : float + Kernel bandwidth. + + Returns + ------- + ndarray + Exponential kernel weights. + """ + z = distances / bandwidth + return numpy.exp(-z) + + +def _boxcar(distances, bandwidth): + """ + Boxcar (uniform) kernel function. + + Parameters + ---------- + distances : ndarray + Array of distances. + bandwidth : float + Kernel bandwidth. + + Returns + ------- + ndarray + Binary weights: 1 if distance < bandwidth, else 0. + """ + distances = numpy.asarray(distances) + return (distances < bandwidth).astype(float) + + +def _identity(distances, _): + """ + Identity kernel (no weighting, returns raw distances). + + Parameters + ---------- + distances : ndarray + Array of distances. + _ : float + Unused bandwidth parameter. + + Returns + ------- + ndarray + The raw input distances. + """ + return distances + + +_kernel_functions = { + "triangular": _triangular, + "parabolic": _parabolic, + "gaussian": _gaussian, + "bisquare": _bisquare, + "cosine": _cosine, + "boxcar": _boxcar, + "discrete": _boxcar, + "exponential": _exponential, + "identity": _identity, + None: _identity, +} + + +def kernel(distances, bandwidth, kernel="gaussian", taper=True, decay=False): + """Evaluate a kernel function over a distance array. + + Parameters + ---------- + distances : ndarray + Array of distances. + bandwidth : float + Kernel bandwidth. + kernel : str or callable, optional + The kernel function to use. If a string, must be one of the predefined + kernel names: 'triangular', 'parabolic', 'gaussian', 'bisquare', + 'cosine', 'boxcar', 'discrete', 'exponential', 'identity'. + If callable, it should have the signature `(distances, bandwidth)`. + If None, the 'identity' kernel is used. + taper : bool (default: True) + Set kernel = 0 for all distances exceeding the bandwith. To + evaluate kernel beyond bandwith set taper=False. + decay : bool (default: False) + whether to calculate the kernel using the decay formulation. + In the decay form, a kernel measures the distance decay in + similarity between observations. It varies from from maximal + similarity (1) at a distance of zero to minimal similarity (0 + or negative) at some very large (possibly infinite) distance. + Otherwise, kernel functions are treated as proper + volume-preserving probability distributions. + + Returns + ------- + ndarray + Kernel function evaluated at distance values. + + """ + if callable(kernel): + func = kernel + elif kernel is None: + func = _kernel_functions[None] + else: + func = _kernel_functions[kernel] + + k = func(distances, bandwidth) + if taper: + k[distances > bandwidth] = 0.0 + + if decay: + k /= func(0.0, bandwidth) + + return k diff --git a/libpysal/tests/test_kernels.py b/libpysal/tests/test_kernels.py new file mode 100644 index 000000000..647178ffd --- /dev/null +++ b/libpysal/tests/test_kernels.py @@ -0,0 +1,115 @@ +import numpy as np +import pytest + +import libpysal.kernels as kernels + + +@pytest.fixture +def distances(): + return np.array([0.0, 0.5, 1.0, 1.5]) + + +@pytest.fixture +def bandwidth(): + return 1.0 + + +# --- Individual kernel tests --- + +def test_trim_clipping(): + d = np.array([-2.0, -1.0, 0.0, 1.0, 2.0]) + bw = 1.0 + result = kernels._trim(d, bw) + expected = np.clip(np.abs(d) / bw, 0, 1) + np.testing.assert_array_almost_equal(result, expected) + + +def test_triangular(distances, bandwidth): + expected = 1 - np.clip(np.abs(distances) / bandwidth, 0, 1) + np.testing.assert_array_almost_equal( + kernels._triangular(distances, bandwidth), expected + ) + + +def test_parabolic(distances, bandwidth): + z = np.clip(np.abs(distances) / bandwidth, 0, 1) + expected = 0.75 * (1 - z**2) + np.testing.assert_array_almost_equal( + kernels._parabolic(distances, bandwidth), expected + ) + + +def test_gaussian(distances, bandwidth): + z = distances / bandwidth + exponent = -0.5 * z**2 + c = 1 / np.sqrt(2 * np.pi) + expected = c * np.exp(exponent) + np.testing.assert_array_almost_equal( + kernels._gaussian(distances, bandwidth), expected + ) + + +def test_bisquare(distances, bandwidth): + z = np.clip(distances / bandwidth, 0, 1) + expected = (15 / 16) * (1 - z**2) ** 2 + np.testing.assert_array_almost_equal( + kernels._bisquare(distances, bandwidth), expected + ) + + +def test_cosine(distances, bandwidth): + z = np.clip(distances / bandwidth, 0, 1) + expected = (np.pi / 4) * np.cos(np.pi / 2 * z) + np.testing.assert_array_almost_equal( + kernels._cosine(distances, bandwidth), expected + ) + + +def test_exponential(distances, bandwidth): + z = distances / bandwidth + #expected = np.where(z <= 1, np.exp(-z), 0) + expected = np.exp(-z) + np.testing.assert_array_almost_equal( + kernels._exponential(distances, bandwidth), expected + ) + + +def test_boxcar(distances, bandwidth): + expected = (distances < bandwidth).astype(int) + np.testing.assert_array_equal( + kernels._boxcar(distances, bandwidth), expected + ) + + +def test_identity(distances, bandwidth): + np.testing.assert_array_equal( + kernels._identity(distances, bandwidth), distances + ) + + +# --- Dispatcher tests --- + +@pytest.mark.parametrize("name", [ + "triangular", "parabolic", "gaussian", "bisquare", + "cosine", "boxcar", "discrete", "exponential", "identity", None +]) +def test_kernel_dispatcher_names(distances, bandwidth, name): + # Should not raise or error + result = kernels.kernel(distances, bandwidth, kernel=name) + assert isinstance(result, np.ndarray) + assert result.shape == distances.shape + + +def test_kernel_dispatcher_callable(distances, bandwidth): + def linear_kernel(d, bw): + return 1 - np.clip(d / bw, 0, 1) + + result = kernels.kernel(distances, bandwidth, kernel=linear_kernel) + expected = linear_kernel(distances, bandwidth) + np.testing.assert_array_almost_equal(result, expected) + + +def test_kernel_dispatcher_invalid_name(distances, bandwidth): + with pytest.raises(KeyError): + kernels.kernel(distances, bandwidth, kernel="not-a-kernel") + diff --git a/libpysal/weights/distance.py b/libpysal/weights/distance.py index a8384d322..b2f19b992 100644 --- a/libpysal/weights/distance.py +++ b/libpysal/weights/distance.py @@ -372,8 +372,8 @@ class Kernel(W): bandwidth : float or array-like (optional) the bandwidth :math:`h_i` for the kernel. - fixed : binary - If true then :math:`h_i=h \\forall i`. If false then + fixed : bool + If True then :math:`h_i=h \\forall i`. If False then bandwidth is adaptive across observations. k : int the number of nearest neighbors to use for determining @@ -424,6 +424,10 @@ class Kernel(W): eps : float adjustment to ensure knn distance range is closed on the knnth observations + normalize : bool + If True (default) Gaussian kernel is normalized to integrate to 1. + If False K(0)=1. + Attributes ---------- @@ -530,8 +534,10 @@ def __init__( diagonal=False, distance_metric="euclidean", radius=None, + normalize=True, **kwargs, ): + self._normalize = normalize if radius is not None: distance_metric = "arc" if isKDTree(data): @@ -685,7 +691,10 @@ def _eval_kernel(self): ) z.append(zi) zs = z - # functions follow Anselin and Rey (2010) table 5.4 + + # functions follow Anselin and Rey (2014) Modern Spatial Econometircs + # in Practice. Pg 78 + if self.function == "triangular": self.kernel = [1 - zi for zi in zs] elif self.function == "uniform": @@ -697,6 +706,8 @@ def _eval_kernel(self): elif self.function == "gaussian": c = np.pi * 2 c = c ** (-0.5) + if self._normalize is False: + c = 1 self.kernel = [c * np.exp(-(zi**2) / 2.0) for zi in zs] else: print(("Unsupported kernel function", self.function)) diff --git a/libpysal/weights/tests/test_distance.py b/libpysal/weights/tests/test_distance.py index e32948d03..3d659b49d 100644 --- a/libpysal/weights/tests/test_distance.py +++ b/libpysal/weights/tests/test_distance.py @@ -351,6 +351,18 @@ def test_from_geodataframe(self): for k, v in list(w[self.known_wi5 - 1].items()): np.testing.assert_allclose(v, self.known_w5[k + 1], rtol=RTOL) + def test_w_normalize(self): + wg = d.Kernel.from_array(self.points, function="gaussian") + np.testing.assert_allclose( + wg.weights[0], + [0.3989422804014327, 0.35206533556593145, 0.3412334260702758], + rtol=RTOL, + ) + wgun = d.Kernel.from_array(self.points, function="gaussian", normalize=False) + np.testing.assert_allclose( + wgun.weights[0], [1.0, 0.8824969246470149, 0.8553453540369604], rtol=RTOL + ) + ########################## # Function/User tests # ##########################