|
12 | 12 |
|
13 | 13 | from ._accessor import (DelegatedMethod, DelegatedProperty,
|
14 | 14 | delegated_method)
|
15 |
| -from ._utils import combine, pack, unpack |
| 15 | +from ._utils import combine, pack, unpack, refactorize |
16 | 16 | from .common import _U8_MAX, _IPv4_MAX
|
17 | 17 | from .parser import _to_ipaddress_pyint, _as_ip_object
|
18 | 18 |
|
@@ -69,6 +69,10 @@ def __init__(self, values):
|
69 | 69 | values = _to_ip_array(values) # TODO: avoid potential copy
|
70 | 70 | self.data = values
|
71 | 71 |
|
| 72 | + @classmethod |
| 73 | + def _constructor_from_sequence(cls, scalars): |
| 74 | + return cls(scalars) |
| 75 | + |
72 | 76 | # -------------------------------------------------------------------------
|
73 | 77 | # Pandas Interface
|
74 | 78 | # -------------------------------------------------------------------------
|
@@ -287,7 +291,7 @@ def equals(self, other):
|
287 | 291 |
|
288 | 292 | def isna(self):
|
289 | 293 | ips = self.data
|
290 |
| - return (ips['lo'] == 0) & (ips['lo'] - ips['hi'] == 0) |
| 294 | + return (ips['lo'] == 0) & (ips['hi'] == 0) |
291 | 295 |
|
292 | 296 | def argsort(self, axis=-1, kind='quicksort', order=None):
|
293 | 297 | return self.data.argsort()
|
@@ -460,16 +464,67 @@ def unique(self):
|
460 | 464 | data = self.data.take(np.sort(indices))
|
461 | 465 | return self._from_ndarray(data)
|
462 | 466 |
|
463 |
| - def factorize(self, sort=False): |
464 |
| - # XXX: Verify this, check for better algo |
465 |
| - uniques, indices, labels = np.unique(self.data, |
466 |
| - return_index=True, |
467 |
| - return_inverse=True) |
468 |
| - if not sort: |
469 |
| - # Unsort, since np.unique sorts |
470 |
| - uniques = self._from_ndarray(self.data.take(np.sort(indices))) |
471 |
| - labels = np.argsort(uniques.data).take(labels) |
472 |
| - return labels, uniques |
| 467 | + def factorize(self, na_sentinel=-1): |
| 468 | + """Factorize an IPArray into integer labels and unique values. |
| 469 | +
|
| 470 | + Calling :meth:`pandas.Series.factorize` or :meth:`pandas.factorize` |
| 471 | + will dispatch to this method. |
| 472 | +
|
| 473 | + Parameters |
| 474 | + ---------- |
| 475 | + na_sentinel : int, default -1 |
| 476 | + The value in `labels` to use for indicating missing values in |
| 477 | + `self`. |
| 478 | +
|
| 479 | + Returns |
| 480 | + ------- |
| 481 | + labels : ndarray |
| 482 | + An integer-type ndarray the same length as `self`. Each newly- |
| 483 | + observed value in `self` will be assigned the next integer. |
| 484 | + Missing values in `self` are assigned `na_sentinel`. |
| 485 | + uniques : IPArray |
| 486 | + The unique values in `self` in order of appearance, not including |
| 487 | + the missing value ``IPv4Address('0.0.0.0')``. |
| 488 | +
|
| 489 | + See Also |
| 490 | + -------- |
| 491 | + pandas.factorize, pandas.Series.factorize |
| 492 | +
|
| 493 | + Examples |
| 494 | + -------- |
| 495 | + >>> arr = IPArray([2, 2, 0, 1, 2, 2**64 + 1]) |
| 496 | + >>> arr |
| 497 | + IPArray(['0.0.0.2', '0.0.0.2', '0.0.0.0', '0.0.0.1', |
| 498 | + '0.0.0.2', '::1:0:0:0:1']) |
| 499 | +
|
| 500 | + >>> labels, uniques = arr.factorize() |
| 501 | + >>> labels |
| 502 | + array([ 0, 0, -1, 1, 0, 2]) |
| 503 | +
|
| 504 | + Notice that `uniques` does not include the missing value. |
| 505 | + >>> uniques |
| 506 | + IPArray(['0.0.0.2', '0.0.0.1', '::1:0:0:0:1']) |
| 507 | + """ |
| 508 | + # OK, so here's the plan. |
| 509 | + # Start with factorizing `self.data`, which has two unfortunate issues |
| 510 | + # 1. Requires casting to object. |
| 511 | + # 2. Gets the NA logic wrong, since (0, 0) isn't NA to pandas. |
| 512 | + # For now, we can't help with 1. Maybe someday. |
| 513 | + # For 2, we can "fix" things with a little post-factorization cleanup. |
| 514 | + l, u = pd.factorize(self.data) |
| 515 | + mask = self.isna() |
| 516 | + any_na = mask.any() |
| 517 | + |
| 518 | + if any_na: |
| 519 | + first_na = mask.argmax() |
| 520 | + refactorize(l, first_na, na_sentinel=na_sentinel) # inplace op |
| 521 | + |
| 522 | + # u is an ndarray of tuples. Go to our record type, then an IPArray |
| 523 | + u2 = type(self)((u.astype(self.dtype._record_type))) |
| 524 | + # May have a missing value. |
| 525 | + if any_na: |
| 526 | + u2 = u2[~u2.isna()] |
| 527 | + return l, u2 |
473 | 528 |
|
474 | 529 |
|
475 | 530 | # -----
|
|
0 commit comments