Skip to content

Commit 73db2e1

Browse files
committed
Fixes #435
1 parent 6239db1 commit 73db2e1

File tree

3 files changed

+75
-39
lines changed

3 files changed

+75
-39
lines changed

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
# Changelog
22
# 7.035 - UNRELEASED
33
* Latest dtype-next (10.124) - contains upgrades to ham-fisted which allow pmap et al. to accept arbitrary executor services.
4+
* Fix for [issue 438](https://github.com/techascent/tech.ml.dataset/issues/438) - keyword dataset names in tribuo.
5+
* Fix for [issue 435](https://github.com/techascent/tech.ml.dataset/issues/435) - `pd-merge` with `:how :outer` must accept empty datasets.
6+
47

58

69
# 7.034

src/tech/v3/dataset/join.clj

Lines changed: 64 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -346,6 +346,32 @@
346346
(-> (hash-join colname lhs rhs (assoc options :lhs-missing? true))
347347
:left-outer)))
348348

349+
(defn- col-or-data->reader
  "Resolve a join-key spec against `ds` and return the data to join on.

  * `tuple-data` - a single column name, or a sequential collection of column names.
  * `ds` - the dataset the name(s) are looked up in.
  * `outer?` - when truthy, a single column is looked up with `get` and a `[]`
    default instead of `ds-base/column`.  Defaults to false.

  A sequential spec whose count is not 1 is passed to `ds-base/select-columns` and
  the result handed to `ds-readers/value-reader` with `{:copying? true}`.  Any other
  spec is treated as one column name (a one-element sequence is unwrapped first)."
  ([tuple-data ds]
   ;; This arity previously had an empty body and so always returned nil;
   ;; delegate with outer? false so it behaves like the strict lookup.
   (col-or-data->reader tuple-data ds false))
  ([tuple-data ds outer?]
   (if (and (sequential? tuple-data)
            (not= 1 (count tuple-data)))
     (-> (ds-base/select-columns ds tuple-data)
         (ds-readers/value-reader {:copying? true}))
     (let [tuple-data (if (sequential? tuple-data)
                        (first tuple-data)
                        tuple-data)]
       (if outer?
         ;; Outer merges tolerate a dataset that lacks the join column.
         (get ds tuple-data [])
         ;; Else not having the column is an error (presumably raised by
         ;; ds-base/column, which is not visible here -- confirm).
         (ds-base/column ds tuple-data))))))
363+
364+
(defn- ensure-sequential
  "Return `colname` unchanged when it is already `sequential?`; otherwise wrap
  it in a one-element vector.  Anything that is not sequential -- a keyword,
  a set, nil -- is wrapped."
  [colname]
  (if (sequential? colname)
    colname
    [colname]))
367+
368+
(defn- filter-columns
  "Narrow the join-column list `collist` for dataset `ds`.

  Returns nil when `collist` is nil.  When `outer?` is truthy, returns a vector
  of the entries of `collist` that appear in `(ds-base/column-names ds)`, in
  their original order; otherwise returns `collist` untouched."
  [ds collist outer?]
  (when collist
    (if-not outer?
      collist
      ;; The set of names present in ds doubles as the membership predicate.
      (let [present (set (ds-base/column-names ds))]
        (filterv present collist)))))
374+
349375

350376
(defn pd-merge
351377
"Pandas-style [merge](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.merge.html).
@@ -419,15 +445,6 @@ outer-join [8 4]:
419445
([left-ds right-ds options]
420446
(let [lhs-table-name (default-table-name left-ds "left")
421447
rhs-table-name (default-table-name right-ds "right")
422-
col-or-data->reader
423-
(fn [tuple-data ds]
424-
(if (and (sequential? tuple-data)
425-
(not= 1 (count tuple-data)))
426-
(-> (ds-base/select-columns ds tuple-data)
427-
(ds-readers/value-reader {:copying? true}))
428-
(if (sequential? tuple-data)
429-
(ds-base/column ds (first tuple-data))
430-
(ds-base/column ds tuple-data))))
431448
how (get options :how :inner)]
432449
(if (identical? how :cross)
433450
(do
@@ -453,10 +470,10 @@ outer-join [8 4]:
453470
[lhs-table-name lhs-columns]
454471
[rhs-table-name rhs-columns]))
455472
(update-join-metadata lhs-table-name rhs-table-name))))
456-
(let [left-on (get options :left-on (get options :on))
457-
right-on (get options :right-on (get options :on))
458-
left-on (when left-on (if-not (sequential? left-on) [left-on] left-on))
459-
right-on (when right-on (if-not (sequential? right-on) [right-on] right-on))
473+
(let [left-on (ensure-sequential (get options :left-on (get options :on)))
474+
right-on (ensure-sequential (get options :right-on (get options :on)))
475+
476+
outer? (identical? :outer (get options :how))
460477
on-int (->> (concat left-on right-on)
461478
(filter (set/intersection (set left-on) (set right-on)))
462479
(distinct)
@@ -465,8 +482,10 @@ outer-join [8 4]:
465482
(== (count left-on) (count right-on))
466483
"Number of left join columns (%d) doesn't equal number of right join columns %d"
467484
(count left-on) (count right-on))
468-
left-join-data (col-or-data->reader left-on left-ds)
469-
right-join-data (col-or-data->reader right-on right-ds)
485+
left-on (filter-columns left-ds left-on outer?)
486+
right-on (filter-columns right-ds right-on outer?)
487+
left-join-data (col-or-data->reader left-on left-ds outer?)
488+
right-join-data (col-or-data->reader right-on right-ds outer?)
470489

471490

472491
{:keys [lhs-indexes rhs-indexes lhs-missing rhs-missing]}
@@ -524,30 +543,36 @@ outer-join [8 4]:
524543
[rhs-table-name rhs-cols]))
525544
(update-join-metadata lhs-table-name rhs-table-name)))
526545
:outer
527-
(let [n-left-empty (count rhs-missing)
528-
n-right-empty (count lhs-missing)
529-
;;Order is intersection, left-missing, right-missing
530-
lhs-indexes (add-all! (dtype/clone lhs-indexes) lhs-missing)
531-
left-valid (ds-base/select-rows left-ds lhs-indexes)
532-
right-valid (ds-base/select-rows right-ds rhs-indexes)
533-
right-missing (ds-base/select-rows right-ds rhs-missing)
534-
;;For the columns we perhaps joined on
535-
intersection-ds (-> (ds-base/select-columns left-valid on-int)
536-
(ds-base/concat-copying (ds-base/select-columns
537-
right-missing on-int)))
538-
left-full (-> (ds-base/remove-columns left-valid on-int)
539-
(ds-base/extend-with-empty n-left-empty))
540-
right-full (-> (ds-base/remove-columns right-valid on-int)
541-
(ds-base/extend-with-empty n-right-empty)
542-
(ds-base/concat-copying (ds-base/remove-columns
543-
right-missing on-int)))]
544-
(-> (ds-impl/new-dataset
545-
"outer-join"
546-
(nice-column-names
547-
[lhs-table-name (concat (ds-base/columns intersection-ds)
548-
(ds-base/columns left-full))]
549-
[rhs-table-name (ds-base/columns right-full)]))
550-
(update-join-metadata lhs-table-name rhs-table-name))))))))
546+
(cond
547+
(== 0 (ds-base/row-count left-ds))
548+
(vary-meta right-ds assoc :name "outer-join")
549+
(== 0 (ds-base/row-count right-ds))
550+
(vary-meta left-ds assoc :name "outer-join")
551+
:else
552+
(let [n-left-empty (count rhs-missing)
553+
n-right-empty (count lhs-missing)
554+
;;Order is intersection, left-missing, right-missing
555+
lhs-indexes (add-all! (dtype/clone lhs-indexes) lhs-missing)
556+
left-valid (ds-base/select-rows left-ds lhs-indexes)
557+
right-valid (ds-base/select-rows right-ds rhs-indexes)
558+
right-missing (ds-base/select-rows right-ds rhs-missing)
559+
;;For the columns we perhaps joined on
560+
intersection-ds (-> (ds-base/select-columns left-valid on-int)
561+
(ds-base/concat-copying (ds-base/select-columns
562+
right-missing on-int)))
563+
left-full (-> (ds-base/remove-columns left-valid on-int)
564+
(ds-base/extend-with-empty n-left-empty))
565+
right-full (-> (ds-base/remove-columns right-valid on-int)
566+
(ds-base/extend-with-empty n-right-empty)
567+
(ds-base/concat-copying (ds-base/remove-columns
568+
right-missing on-int)))]
569+
(-> (ds-impl/new-dataset
570+
"outer-join"
571+
(nice-column-names
572+
[lhs-table-name (concat (ds-base/columns intersection-ds)
573+
(ds-base/columns left-full))]
574+
[rhs-table-name (ds-base/columns right-full)]))
575+
(update-join-metadata lhs-table-name rhs-table-name)))))))))
551576
([left-ds right-ds]
552577
(pd-merge left-ds right-ds {:on (set/intersection
553578
(set (ds-base/column-names left-ds))

test/tech/v3/dataset/join_test.clj

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -391,3 +391,11 @@
391391
(is (= #{:product :customer}
392392
(set (ds/column-names mm))))))
393393

394+
395+
(deftest pd-merge-issue-435
  ;; Issue 435: an outer pd-merge on :t must return a truthy result when either
  ;; the left or the right dataset is empty.
  (let [outer-on-t {:on :t :how :outer}
        populated  #(ds/->dataset {:t [0 1] :x [:a :b]})]
    ;; Empty dataset on the left.
    (is (ds-join/pd-merge (ds/empty-dataset) (populated) outer-on-t))
    ;; Empty dataset on the right.
    (is (ds-join/pd-merge (populated) (ds/empty-dataset) outer-on-t))))

0 commit comments

Comments
 (0)