Commit 60f70b1

test=develop
1 parent: cc02353

14 files changed: +120 -183 lines changed

CMakeLists.txt

Lines changed: 1 addition & 3 deletions
@@ -212,6 +212,7 @@ endif()
 
 
 include(external/threadpool)
+include(flags) # set paddle compile flags
 include(cudnn) # set cudnn libraries, must before configure
 include(configure) # add paddle env configuration
 
@@ -225,9 +226,6 @@ elseif()
   set(WITH_ANAKIN OFF CACHE STRING "Anakin is used in MKL only now." FORCE)
 endif()
 
-include(flags) # set paddle compile flags
-include(cudnn) # set cudnn libraries, must before configure
-include(configure) # add paddle env configuration
 include(generic) # simplify cmake module
 include(package) # set paddle packages
 include(ccache) # set ccache for compilation

paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc

Lines changed: 0 additions & 1 deletion
@@ -135,7 +135,6 @@ void MainThreads(int num_threads, bool use_gpu) {
 }  // namespace paddle
 
 int main(int argc, char** argv) {
-  FLAGS_dirname = "./word2vec.inference.model";
   google::ParseCommandLineFlags(&argc, &argv, true);
   paddle::demo::Main(false /* use_gpu*/);
   paddle::demo::MainThreads(1, false /* use_gpu*/);
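The deleted line had hardcoded the model path before the flags were parsed; with it gone, the demo's model directory comes from the command line. Below is a minimal sketch of that gflags pattern; the flag name is taken from the deleted line, but the DEFINE_string declaration and the printout are illustrative assumptions, not the demo's actual code.

#include <gflags/gflags.h>
#include <iostream>

DEFINE_string(dirname, "", "Directory of the inference model.");

int main(int argc, char** argv) {
  // Without the hardcoded assignment, the value comes solely from the
  // command line, e.g.:
  //   ./simple_on_word2vec --dirname=./word2vec.inference.model
  google::ParseCommandLineFlags(&argc, &argv, true);
  std::cout << "model dir: " << FLAGS_dirname << "\n";
  return 0;
}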

paddle/fluid/inference/api/demo_ci/test.cc

Lines changed: 0 additions & 99 deletions
This file was deleted.

paddle/fluid/inference/api/helper.h

Lines changed: 1 addition & 1 deletion
@@ -97,7 +97,7 @@ static void TensorAssignData(PaddleTensor *tensor,
 }
 
 template <typename T>
-static int ZeroCopyTensorAssignData(paddle::ZeroCopyTensor *tensor,
+static int ZeroCopyTensorAssignData(ZeroCopyTensor *tensor,
                                     const std::vector<std::vector<T>> &data) {
   int size{0};
   auto *ptr = tensor->mutable_data<T>(PaddlePlace::kCPU);

paddle/fluid/operators/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -291,7 +291,7 @@ op_library(gru_op DEPS sequence2batch gru_compute)
 op_library(recurrent_op DEPS executor)
 op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
 op_library(cos_sim_op DEPS cos_sim_functor)
-op_library(parallel_do_op DEPS executor glog)
+op_library(parallel_do_op DEPS executor)
 op_library(unsqueeze_op DEPS reshape_op)
 op_library(squeeze_op DEPS reshape_op)
 op_library(extract_rows_op DEPS memory)

paddle/fluid/operators/batch_norm_op.cu.cc

Lines changed: 0 additions & 21 deletions
@@ -141,27 +141,6 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
           bias->template data<BatchNormParamType<T>>(),
           est_mean->template data<BatchNormParamType<T>>(),
           est_var->template data<BatchNormParamType<T>>(), epsilon));
-
-      VLOG(3) << "before tensor copy";
-      Tensor mean_, var_, x_, y_;
-      framework::TensorCopy(*est_mean, platform::CPUPlace(), dev_ctx, &mean_);
-      framework::TensorCopy(*est_var, platform::CPUPlace(), dev_ctx, &var_);
-      framework::TensorCopy(*x, platform::CPUPlace(), dev_ctx, &x_);
-      framework::TensorCopy(*y, platform::CPUPlace(), dev_ctx, &y_);
-      VLOG(3) << "after tensor copy";
-      auto check_tensor = [&](const Tensor& check) {
-        float sum = .0;
-        for(size_t i=0; i < check.numel(); ++i) {
-          sum += check.data<float>()[i];
-        }
-        return sum;
-      };
-      VLOG(3) << "BatchNormKernel";
-      VLOG(3) << "mean" << check_tensor(mean_);
-      VLOG(3) << "var" << check_tensor(var_);
-      VLOG(3) << "x" << check_tensor(x_);
-      VLOG(3) << "y" << check_tensor(y_);
-
     } else {
       // Run training mode.
       // obtain running mean and running inv var, and see if we need to

paddle/fluid/operators/conv_cudnn_op.cu.cc

Lines changed: 7 additions & 7 deletions
@@ -43,7 +43,6 @@ template <typename T>
 class CUDNNConvOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    VLOG(3) << "inside cudnn";
     PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
                    "It must use CUDAPlace.");
     auto* input = ctx.Input<Tensor>("Input");
@@ -60,7 +59,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
     const T* input_data = input->data<T>();
     const T* filter_data = filter->data<T>();
     T* output_data = output->mutable_data<T>(ctx.GetPlace());
-    VLOG(3) << "get all inputs";
+
     // ------------------- cudnn descriptors ---------------------
     ScopedTensorDescriptor input_desc;
     ScopedTensorDescriptor output_desc;
@@ -73,7 +72,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
 
     cudnnConvolutionDescriptor_t cudnn_conv_desc =
         conv_desc.descriptor<T>(paddings, strides, dilations);
-    VLOG(3) << "create tensor descriptor";
+
 #if CUDNN_VERSION_MIN(7, 0, 1)
     // cudnn 7 can support groups, no need to do it mannually
     // FIXME(typhoonzero): find a better way to disable groups
@@ -82,7 +81,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
         cudnn_conv_desc, groups));
     groups = 1;
 #endif
-    VLOG(3) << "before create tensor descriptor";
+
     cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
         layout, framework::vectorize2int(input->dims()), groups);
     cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
@@ -112,7 +111,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
       output_height = output->dims()[2];
       output_width = output->dims()[3];
     }
-    VLOG(3) << "after create tensor descriptor";
+
     int group_offset_in =
         input_channels / groups * input_height * input_width * input_depth;
     int group_offset_out =
@@ -129,7 +128,6 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
     auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     auto handle = dev_ctx.cudnn_handle();
 
-    VLOG(3) << "set cudnn algorithm";
     CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
         handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
         cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
@@ -150,7 +148,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
           cudnn_conv_desc, CUDNN_DEFAULT_MATH));
     }
 #endif
-    VLOG(3) << "before get workspace";
+
     // get workspace size able to allocate
     CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
         handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
@@ -159,6 +157,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
     // the limit because the algo is overrided to use tensor core.
     PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit,
                       "workspace_size to be allocated exceeds the limit");
+
     // ------------------- cudnn conv forward ---------------------
     ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
     for (int i = 0; i < groups; i++) {
@@ -312,6 +311,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
           cudnn_filter_desc, filter_algo, &tmp_size));
       workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size);
     }
+
     // ------------------- cudnn conv backward data ---------------------
     ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
     if (input_grad) {

paddle/fluid/operators/fetch_op.cc

Lines changed: 0 additions & 2 deletions
@@ -42,8 +42,6 @@ class FetchOp : public framework::OperatorBase {
                    "Cannot find out_var in scope, out_var_name is %s",
                    out_name);
 
-    VLOG(3) << "fetch_var ptr " << fetch_var << " is " << (fetch_var == nullptr);
-    VLOG(3) << "out_var ptr " << out_var << " is " << (out_var == nullptr);
     auto col = static_cast<size_t>(Attr<int>("col"));
 
     auto *fetch_list = out_var->GetMutable<framework::FeedFetchList>();

paddle/fluid/operators/label_smooth_op.cc

Lines changed: 1 addition & 1 deletion
@@ -34,7 +34,7 @@ class LabelSmoothOp : public framework::OperatorWithKernel {
     auto in_dims = ctx->GetInputDim("X");
     if (ctx->HasInput("PriorDist")) {
       auto noise_dims = ctx->GetInputDim("PriorDist");
-      int64_t noise_numel = paddle::framework::product(noise_dims);
+      auto noise_numel = paddle::framework::product(noise_dims);
       PADDLE_ENFORCE(
           in_dims[1] == noise_numel,
           "The number of elements in Input(PriorDist) must be equal to the "

paddle/fluid/operators/load_combine_op.cc

Lines changed: 27 additions & 24 deletions
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include <fstream>
-#include <vector>
+#include <memory>
 #include "paddle/fluid/framework/data_type_transform.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device_context.h"
@@ -33,10 +33,15 @@ class LoadCombineOp : public framework::OperatorBase {
                const platform::Place &place) const override {
     auto filename = Attr<std::string>("file_path");
     auto load_as_fp16 = Attr<bool>("load_as_fp16");
-
-    std::ifstream fin(filename, std::ios_base::in | std::ios_base::binary);
-    //std::ifstream fin(filename, std::ios_base::in);
-    PADDLE_ENFORCE(!fin.bad(),
+    auto format = Attr<std::string>("format");
+    std::unique_ptr<std::ifstream> fin;
+    if (format == "windows") {
+      fin.reset(new std::ifstream(filename,
+                                  std::ios_base::in | std::ios_base::binary));
+    } else {
+      fin.reset(new std::ifstream(filename));
+    }
+    PADDLE_ENFORCE(static_cast<bool>(*fin),
                    "Cannot open file %s for load_combine op", filename);
 
     auto out_var_names = Outputs("Out");
@@ -48,32 +53,20 @@ class LoadCombineOp : public framework::OperatorBase {
     auto &dev_ctx = *pool.Get(place);
 
     for (size_t i = 0; i < out_var_names.size(); i++) {
-      VLOG(3) << "load variable " << out_var_names[i];
       auto *out_var = scope.FindVar(out_var_names[i]);
 
       PADDLE_ENFORCE(out_var != nullptr, "Output variable %s cannot be found",
                      out_var_names[i]);
 
       auto *tensor = out_var->GetMutable<framework::LoDTensor>();
-      VLOG(3) << "Get Tensor";
+
       // Error checking
-      PADDLE_ENFORCE(!fin.bad(), "Cannot read more from file %s",
+      PADDLE_ENFORCE(static_cast<bool>(*fin), "Cannot read more from file %s",
                      filename);
-      VLOG(3) << "before deserialization";
+
       // Get data from fin to tensor
-      DeserializeFromStream(fin, tensor, dev_ctx);
-      // VLOG(3) << "after deserialization";
-      // framework::Tensor check;
-      // framework::TensorCopy(*tensor, platform::CPUPlace(), dev_ctx, &check);
-      // float sum = .0;
-      // for(size_t i=0; i < check.numel(); ++i) {
-      //   if(std::type_index(check.type()) == std::type_index(typeid(int64_t))) {
-      //     sum += static_cast<float>(check.data<int64_t>()[i]);
-      //   } else {
-      //     sum += check.data<float>()[i];
-      //   }
-      // }
-      // VLOG(3) << "sum result" << sum;
+      DeserializeFromStream(*fin, tensor, dev_ctx);
+
       auto in_dtype = framework::ToDataType(tensor->type());
       auto out_dtype =
           load_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
@@ -93,9 +86,7 @@ class LoadCombineOp : public framework::OperatorBase {
         tensor = out_var->GetMutable<framework::LoDTensor>();
         tensor->set_lod(fp16_tensor.lod());
         tensor->ShareDataWith(fp16_tensor);
-
       }
-      VLOG(3) << "load " << out_var_names[i] << " finished";
     }
   }
 };
@@ -119,6 +110,18 @@ class LoadCombineOpProtoMaker : public framework::OpProtoAndCheckerMaker {
                          "LoDTensors will be loaded from \"file_path\".")
         .AddCustomChecker(
             [](const std::string &path) { return !path.empty(); });
+    AddAttr<std::string>("format",
+                         R"DOC((windows|linux)" "saved model file format
+windows and linux file newline symbol is
+different. windows(newline is \n\r) or linux(newline is \r)
+So if you set attribute format to windows, then we saved model file in binary.
+It can be used both linux and windows. If you set format to linux,
+it will save file in normal file, newline symbol is \r. Need to note
+that these two format is not inter-compatible.)DOC")
+        .SetDefault("linux")
+        .AddCustomChecker([](const std::string &s) {
+          return s == "windows" || s == "linux";
+        });
     AddComment(R"DOC(
 LoadCombine Operator.
 
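Two details of this change are easy to miss. First, the open-check moved from !fin.bad() to static_cast<bool>(*fin): a failed open sets the stream's failbit, which bad() ignores (it reports only badbit), so the old check passed even when the file never opened; the stream's operator bool reports both bits. A standalone sketch of the difference, with a hypothetical file name and no Paddle code:

#include <fstream>
#include <iostream>

int main() {
  std::ifstream fin("no_such_model_file");  // hypothetical path; open fails, failbit set
  std::cout << std::boolalpha;
  // bad() reports only badbit (a corrupted stream), so the old check
  // !fin.bad() still passes here and misses the failed open.
  std::cout << "!fin.bad():             " << !fin.bad() << "\n";              // true
  // operator bool (what static_cast<bool> invokes) is false whenever failbit
  // or badbit is set, so it correctly rejects the unopened stream.
  std::cout << "static_cast<bool>(fin): " << static_cast<bool>(fin) << "\n";  // false
  return 0;
}

Second, the binary open mode selected by format == "windows" matters because text-mode streams on Windows translate \r\n to \n on input and treat byte 0x1A as end-of-file, either of which silently corrupts serialized tensor bytes; binary mode reads the file untranslated.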
