2929
3030namespace triton { namespace backend { namespace dali { namespace test {
3131
32+ template <typename T, typename Op>
33+ void coalesced_compare (const std::vector<OBufferDescr> &obuffers,
34+ const std::vector<std::vector<T>> &ibuffers, size_t inp_size, const Op &op) {
35+ size_t inp_buff_i = 0 ;
36+ size_t inp_i = 0 ;
37+ size_t out_buff_i = 0 ;
38+ size_t out_i = 0 ;
39+ std::vector<T> obuffer;
40+ for (size_t i = 0 ; i < inp_size; ++i) {
41+ if (inp_i == ibuffers[inp_buff_i].size ()) {
42+ inp_i = 0 ;
43+ inp_buff_i++;
44+ }
45+ if (out_i == obuffers[out_buff_i].size / sizeof (T)) {
46+ out_i = 0 ;
47+ out_buff_i++;
48+ }
49+ if (out_i == 0 ) {
50+ auto descr = obuffers[out_buff_i];
51+ REQUIRE (descr.size % sizeof (T) == 0 );
52+ obuffer.resize (descr.size / sizeof (T));
53+ MemCopy (CPU, obuffer.data (), descr.device , descr.data , descr.size );
54+ }
55+ REQUIRE (obuffer[out_i] == op (ibuffers[inp_buff_i][inp_i]));
56+ out_i++;
57+ inp_i++;
58+ }
59+ }
60+
3261TEST_CASE (" Scaling Pipeline" ) {
3362 std::string pipeline_s ((const char *)pipelines::scale_pipeline_str,
3463 pipelines::scale_pipeline_len);
35- DaliPipeline pipeline (pipeline_s, 8 , 4 , 0 );
64+ DaliPipeline pipeline (pipeline_s, 256 , 4 , 0 );
3665 DaliExecutor executor (std::move (pipeline));
3766 std::mt19937 rand (1217 );
3867 std::uniform_real_distribution<float > dist (-1 .f , 1 .f );
3968 const std::string inp_name = " INPUT0" ;
40- auto scaling_test = [&](const std::vector<int > &batch_sizes) {
69+ auto scaling_test = [&](const std::vector<int > &batch_sizes,
70+ const std::vector<int > &out_batch_sizes,
71+ const std::vector<device_type_t > &out_devs) {
72+ REQUIRE (std::accumulate (batch_sizes.begin (), batch_sizes.end (), 0 ) ==
73+ std::accumulate (out_batch_sizes.begin (), out_batch_sizes.end (), 0 ));
74+ REQUIRE (out_devs.size () == out_batch_sizes.size ());
4175 std::vector<TensorListShape<>> shapes;
4276 for (auto batch_size : batch_sizes) {
4377 TensorListShape<> shape (batch_size, 2 );
@@ -53,33 +87,39 @@ TEST_CASE("Scaling Pipeline") {
5387 size_t inp_size = 0 ;
5488 for (auto &inp_buffer : input_buffers)
5589 inp_size += inp_buffer.size ();
56- std::vector<float > output_buffer (inp_size);
90+ std::vector<std::unique_ptr<IOBufferI>> output_buffers;
91+ int ti = 0 ;
92+ for (size_t out_i = 0 ; out_i < out_batch_sizes.size (); ++out_i) {
93+ int64_t buffer_vol = 0 ;
94+ for (int i = 0 ; i < out_batch_sizes[out_i]; ++i) {
95+ buffer_vol += volume (output[0 ].shape [ti]) * sizeof (float );
96+ ti++;
97+ }
98+ if (out_devs[out_i] == device_type_t ::CPU) {
99+ output_buffers.emplace_back (std::make_unique<IOBuffer<CPU>>(buffer_vol));
100+ } else {
101+ output_buffers.emplace_back (std::make_unique<IOBuffer<GPU>>(buffer_vol));
102+ }
103+ }
57104 std::vector<ODescr> output_vec (1 );
58105 auto &outdesc = output_vec[0 ];
59- OBufferDescr buf_descr;
60- buf_descr.device = device_type_t ::CPU;
61- buf_descr.data = output_buffer.data ();
62- buf_descr.size = output_buffer.size () * sizeof (decltype (output_buffer)::size_type);
63- outdesc.buffers = {buf_descr};
64- executor.PutOutputs (output_vec);
65- size_t out_i = 0 ;
66- int i = 0 ;
67- for (auto &inp_buffer : input_buffers) {
68- for (size_t i = 0 ; i < inp_buffer.size (); ++i) {
69- REQUIRE (output_buffer[out_i] == inp_buffer[i] * 2 );
70- ++out_i;
71- }
106+ for (auto &out_buffer : output_buffers) {
107+ outdesc.buffers .push_back (out_buffer->get_descr ());
72108 }
109+ executor.PutOutputs (output_vec);
110+ coalesced_compare (outdesc.buffers , input_buffers, inp_size, [](float a) { return a * 2 ; });
73111 };
74112
75113 SECTION (" Simple execute" ) {
76- scaling_test ({3 , 2 , 1 });
77- scaling_test ({5 });
114+ scaling_test ({3 , 2 , 1 }, { 6 }, {CPU} );
115+ scaling_test ({5 }, { 5 }, {GPU} );
78116 }
79117
80- SECTION (" Repeat batch size" ) {
81- scaling_test ({3 , 3 });
82- scaling_test ({6 });
118+ SECTION (" Chunked output" ) {
119+ scaling_test ({3 , 3 }, {3 , 3 }, {CPU, CPU});
120+ scaling_test ({6 }, {2 , 4 }, {GPU, GPU});
121+ scaling_test ({8 }, {6 , 2 }, {CPU, GPU});
122+ scaling_test ({64 }, {32 , 16 , 16 }, {CPU, GPU, GPU});
83123 }
84124}
85125
@@ -110,7 +150,7 @@ TEST_CASE("RN50 pipeline") {
110150 obuffer.device = device_type_t ::CPU;
111151 obuffer.device_id = 0 ;
112152 obuffer.data = output_buffer.data ();
113- obuffer.size = output_buffer.size () * sizeof (decltype (output_buffer)::size_type );
153+ obuffer.size = output_buffer.size () * sizeof (decltype (output_buffer)::value_type );
114154 outdesc.buffers = {obuffer};
115155 executor.PutOutputs (output_vec);
116156 for (int c = 0 ; c < output_c; ++c) {
0 commit comments