16
16
#include < algorithm>
17
17
#include < random>
18
18
#include < unordered_map>
19
+ #include < unordered_set>
19
20
#include " google/protobuf/io/zero_copy_stream_impl.h"
20
21
#include " google/protobuf/message.h"
21
22
#include " google/protobuf/text_format.h"
@@ -45,9 +46,7 @@ DatasetImpl<T>::DatasetImpl() {
45
46
fleet_send_batch_size_ = 1024 ;
46
47
fleet_send_sleep_seconds_ = 0 ;
47
48
merge_by_insid_ = false ;
48
- erase_duplicate_feas_ = true ;
49
- keep_unmerged_ins_ = true ;
50
- min_merge_size_ = 2 ;
49
+ merge_size_ = 2 ;
51
50
parse_ins_id_ = false ;
52
51
parse_content_ = false ;
53
52
preload_thread_num_ = 0 ;
@@ -118,15 +117,10 @@ void DatasetImpl<T>::SetParseContent(bool parse_content) {
118
117
}
119
118
120
119
template <typename T>
121
- void DatasetImpl<T>::SetMergeByInsId(
122
- const std::vector<std::string>& merge_slot_list, bool erase_duplicate_feas,
123
- int min_merge_size, bool keep_unmerged_ins) {
120
+ void DatasetImpl<T>::SetMergeByInsId(int merge_size) {
124
121
merge_by_insid_ = true ;
125
122
parse_ins_id_ = true ;
126
- merge_slots_list_ = merge_slot_list;
127
- erase_duplicate_feas_ = erase_duplicate_feas;
128
- min_merge_size_ = min_merge_size;
129
- keep_unmerged_ins_ = keep_unmerged_ins;
123
+ merge_size_ = merge_size;
130
124
}
131
125
132
126
template <typename T>
@@ -643,22 +637,11 @@ void MultiSlotDataset::MergeByInsId() {
643
637
return ;
644
638
}
645
639
auto multi_slot_desc = data_feed_desc_.multi_slot_desc ();
646
- std::unordered_map<int , bool > merge_slots;
647
640
std::vector<std::string> use_slots;
648
- std::vector<bool > use_slots_is_dense;
649
641
for (size_t i = 0 ; i < multi_slot_desc.slots_size (); ++i) {
650
642
const auto & slot = multi_slot_desc.slots (i);
651
643
if (slot.is_used ()) {
652
644
use_slots.push_back (slot.name ());
653
- use_slots_is_dense.push_back (slot.is_dense ());
654
- }
655
- }
656
- for (size_t i = 0 ; i < use_slots.size (); ++i) {
657
- // currently, we don't merge dense slots
658
- if (std::find (merge_slots_list_.begin (), merge_slots_list_.end (),
659
- use_slots[i]) != merge_slots_list_.end () &&
660
- !use_slots_is_dense[i]) {
661
- merge_slots[i] = true ;
662
645
}
663
646
}
664
647
CHECK (multi_output_channel_.size () != 0 ); // NOLINT
@@ -682,134 +665,82 @@ void MultiSlotDataset::MergeByInsId() {
682
665
return a.ins_id_ < b.ins_id_ ;
683
666
});
684
667
685
- auto sort_cmp_uint64 = [&merge_slots](const FeatureItem& a,
686
- const FeatureItem& b) {
687
- auto & a_sign = a.sign ().uint64_feasign_ ;
688
- auto & b_sign = b.sign ().uint64_feasign_ ;
689
- return a_sign < b_sign || (a_sign == b_sign && a.slot () < b.slot ());
690
- };
691
- auto sort_cmp_float = [&merge_slots](const FeatureItem& a,
692
- const FeatureItem& b) {
693
- auto & a_sign = a.sign ().float_feasign_ ;
694
- auto & b_sign = b.sign ().float_feasign_ ;
695
- return a_sign < b_sign || (a_sign == b_sign && a.slot () < b.slot ());
696
- };
697
- auto unique_eq_uint64 = [&merge_slots](const FeatureItem& a,
698
- const FeatureItem& b) {
699
- if (a.slot () == b.slot () &&
700
- merge_slots.find (a.slot ()) == merge_slots.end ()) {
701
- return true ;
702
- }
703
- auto & a_sign = a.sign ().uint64_feasign_ ;
704
- auto & b_sign = b.sign ().uint64_feasign_ ;
705
- return a_sign == b_sign && a.slot () == b.slot ();
706
- };
707
- auto unique_eq_float = [&merge_slots](const FeatureItem& a,
708
- const FeatureItem& b) {
709
- if (a.slot () == b.slot () &&
710
- merge_slots.find (a.slot ()) == merge_slots.end ()) {
711
- return true ;
712
- }
713
- auto & a_sign = a.sign ().float_feasign_ ;
714
- auto & b_sign = b.sign ().float_feasign_ ;
715
- return a_sign == b_sign && a.slot () == b.slot ();
716
- };
717
-
718
668
std::vector<Record> results;
669
+ uint64_t drop_ins_num = 0 ;
670
+ std::unordered_set<uint16_t > all_int64;
671
+ std::unordered_set<uint16_t > all_float;
672
+ std::unordered_set<uint16_t > local_uint64;
673
+ std::unordered_set<uint16_t > local_float;
674
+
719
675
VLOG (3 ) << " recs.size() " << recs.size ();
720
676
for (size_t i = 0 ; i < recs.size ();) {
721
677
size_t j = i + 1 ;
722
678
while (j < recs.size () && recs[j].ins_id_ == recs[i].ins_id_ ) {
723
679
j++;
724
680
}
725
- if (j - i < min_merge_size_) {
726
- if (keep_unmerged_ins_) {
727
- for (size_t k = i; k < j; ++k) {
728
- results.push_back (std::move (recs[k]));
729
- }
730
- }
681
+ if (merge_size_ > 0 && j - i != merge_size_) {
682
+ drop_ins_num += j - i;
683
+ LOG (WARNING) << " drop ins " << recs[i].ins_id_ << " size=" << j - i
684
+ << " , because merge_size=" << merge_size_;
731
685
i = j;
732
686
continue ;
733
687
}
734
688
735
- std::vector<FeatureItem> merge_uint64_feasigns;
736
- std::vector<FeatureItem> merge_float_feasigns;
737
- Record rec = std::move (recs[i]);
689
+ all_int64.clear ();
690
+ all_float.clear ();
691
+ bool has_conflict_slot = false ;
692
+ uint16_t conflict_slot = 0 ;
693
+
694
+ Record rec;
695
+ rec.ins_id_ = recs[i].ins_id_ ;
696
+ rec.content_ = recs[i].content_ ;
738
697
739
- for (size_t k = i + 1 ; k < j; k++) {
698
+ for (size_t k = i; k < j; k++) {
699
+ local_uint64.clear ();
700
+ local_float.clear ();
740
701
for (auto & feature : recs[k].uint64_feasigns_ ) {
741
- if (merge_slots.find (feature.slot ()) != merge_slots.end ()) {
742
- merge_uint64_feasigns.push_back (std::move (feature));
702
+ uint16_t slot = feature.slot ();
703
+ if (all_int64.find (slot) != all_int64.end ()) {
704
+ has_conflict_slot = true ;
705
+ conflict_slot = slot;
706
+ break ;
743
707
}
708
+ local_uint64.insert (slot);
709
+ rec.uint64_feasigns_ .push_back (std::move (feature));
710
+ }
711
+ if (has_conflict_slot) {
712
+ break ;
744
713
}
714
+ all_int64.insert (local_uint64.begin (), local_uint64.end ());
715
+
745
716
for (auto & feature : recs[k].float_feasigns_ ) {
746
- if (merge_slots.find (feature.slot ()) != merge_slots.end ()) {
747
- merge_float_feasigns.push_back (std::move (feature));
717
+ uint16_t slot = feature.slot ();
718
+ if (all_float.find (slot) != all_float.end ()) {
719
+ has_conflict_slot = true ;
720
+ conflict_slot = slot;
721
+ break ;
748
722
}
723
+ local_float.insert (slot);
724
+ rec.float_feasigns_ .push_back (std::move (feature));
725
+ }
726
+ if (has_conflict_slot) {
727
+ break ;
749
728
}
750
- recs[k] = Record ( );
729
+ all_float. insert (local_float. begin (), local_float. end () );
751
730
}
752
- i = j;
753
731
754
- if (!erase_duplicate_feas_) {
755
- rec.uint64_feasigns_ .insert (rec.uint64_feasigns_ .end (),
756
- merge_uint64_feasigns.begin (),
757
- merge_uint64_feasigns.end ());
758
- rec.float_feasigns_ .insert (rec.float_feasigns_ .end (),
759
- merge_float_feasigns.begin (),
760
- merge_float_feasigns.end ());
732
+ if (has_conflict_slot) {
733
+ LOG (WARNING) << " drop ins " << recs[i].ins_id_ << " size=" << j - i
734
+ << " , because conflict_slot=" << use_slots[conflict_slot];
735
+ drop_ins_num += j - i;
761
736
} else {
762
- std::vector<FeatureItem> not_merge_uint64_feasigns;
763
- std::vector<FeatureItem> not_merge_float_feasigns;
764
-
765
- for (auto & feature : rec.uint64_feasigns_ ) {
766
- if (merge_slots.find (feature.slot ()) != merge_slots.end ()) {
767
- merge_uint64_feasigns.push_back (std::move (feature));
768
- } else {
769
- not_merge_uint64_feasigns.push_back (std::move (feature));
770
- }
771
- }
772
- for (auto & feature : rec.float_feasigns_ ) {
773
- if (merge_slots.find (feature.slot ()) != merge_slots.end ()) {
774
- merge_float_feasigns.push_back (std::move (feature));
775
- } else {
776
- not_merge_float_feasigns.push_back (std::move (feature));
777
- }
778
- }
779
- rec.uint64_feasigns_ .clear ();
780
- rec.float_feasigns_ .clear ();
781
-
782
- // erase duplicate uint64 feasigns
783
- std::sort (merge_uint64_feasigns.begin (), merge_uint64_feasigns.end (),
784
- sort_cmp_uint64);
785
- merge_uint64_feasigns.erase (
786
- std::unique (merge_uint64_feasigns.begin (),
787
- merge_uint64_feasigns.end (), unique_eq_uint64),
788
- merge_uint64_feasigns.end ());
789
- rec.uint64_feasigns_ .insert (rec.uint64_feasigns_ .end (),
790
- merge_uint64_feasigns.begin (),
791
- merge_uint64_feasigns.end ());
792
- rec.uint64_feasigns_ .insert (rec.uint64_feasigns_ .end (),
793
- not_merge_uint64_feasigns.begin (),
794
- not_merge_uint64_feasigns.end ());
795
-
796
- // erase duplicate float feasigns
797
- std::sort (merge_float_feasigns.begin (), merge_float_feasigns.end (),
798
- sort_cmp_float);
799
- merge_float_feasigns.erase (
800
- std::unique (merge_float_feasigns.begin (), merge_float_feasigns.end (),
801
- unique_eq_float),
802
- merge_float_feasigns.end ());
803
- rec.float_feasigns_ .insert (rec.float_feasigns_ .end (),
804
- merge_float_feasigns.begin (),
805
- merge_float_feasigns.end ());
806
- rec.float_feasigns_ .insert (rec.float_feasigns_ .end (),
807
- not_merge_float_feasigns.begin (),
808
- not_merge_float_feasigns.end ());
737
+ results.push_back (std::move (rec));
809
738
}
810
- results. push_back (rec) ;
739
+ i = j ;
811
740
}
741
+ std::vector<Record>().swap (recs);
812
742
VLOG (3 ) << " results size " << results.size ();
743
+ LOG (WARNING) << " total drop ins num: " << drop_ins_num;
813
744
results.shrink_to_fit ();
814
745
815
746
auto fleet_ptr = FleetWrapper::GetInstance ();
0 commit comments