
Commit d4f9aa0

Add hash op implementation
1 parent 42b6671 commit d4f9aa0

File tree

6 files changed: +239 -0 lines changed

cmake/external/xxhash.cmake

Lines changed: 43 additions & 0 deletions
@@ -0,0 +1,43 @@
INCLUDE(ExternalProject)

set(XXHASH_SOURCE_DIR ${THIRD_PARTY_PATH}/xxhash)
set(XXHASH_INSTALL_DIR ${THIRD_PARTY_PATH}/install/xxhash)
set(XXHASH_INCLUDE_DIR "${XXHASH_INSTALL_DIR}/include")

ExternalProject_Add(
    extern_xxhash
    ${EXTERNAL_PROJECT_LOG_ARGS}
    GIT_REPOSITORY    "https://github.com/Cyan4973/xxHash"
    GIT_TAG           "v0.6.5"
    PREFIX            ${XXHASH_SOURCE_DIR}
    DOWNLOAD_NAME     "xxhash"
    UPDATE_COMMAND    ""
    PATCH_COMMAND     ""
    CONFIGURE_COMMAND ""
    BUILD_IN_SOURCE   1
    BUILD_COMMAND     make lib
    INSTALL_COMMAND   export PREFIX=${XXHASH_INSTALL_DIR}/ && make install
    TEST_COMMAND      ""
)

set(XXHASH_LIBRARIES "${XXHASH_INSTALL_DIR}/lib/libxxhash.a")

add_library(xxhash STATIC IMPORTED GLOBAL)
set_property(TARGET xxhash PROPERTY IMPORTED_LOCATION ${XXHASH_LIBRARIES})
#if (${CMAKE_VERSION} VERSION_LESS "3.3.0")
#    set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/lib_xxhash_dummy.c)
#    file(WRITE ${dummyfile} "const char * dummy_any = \"${dummyfile}\";")
#    add_library(lib_xxhash STATIC ${dummyfile})
#else()
#    add_library(lib_xxhash INTERFACE)
#endif()
include_directories(${XXHASH_INCLUDE_DIR})
add_dependencies(xxhash extern_xxhash)
#LIST(APPEND external_project_dependencies xxhash)
#link_libraries(${XXHASH_LIBRARIES})

paddle/fluid/operators/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -268,6 +268,7 @@ if (WITH_GPU AND TENSORRT_FOUND)
 else()
   set(DEPS_OPS ${DEPS_OPS} tensorrt_engine_op)
 endif()
+op_library(hash_op DEPS xxhash)
 op_library(clip_by_norm_op DEPS selected_rows_functor selected_rows)
 op_library(sum_op DEPS selected_rows_functor)
 op_library(sgd_op DEPS selected_rows_functor)

paddle/fluid/operators/hash_op.cc

Lines changed: 74 additions & 0 deletions
@@ -0,0 +1,74 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/hash_op.h"
#include <string>
#include <vector>

namespace paddle {
namespace operators {

class HashOp : public framework::OperatorWithKernel {
 public:
  HashOp(const std::string &type, const framework::VariableNameMap &inputs,
         const framework::VariableNameMap &outputs,
         const framework::AttributeMap &attrs)
      : OperatorWithKernel(type, inputs, outputs, attrs) {}

  void InferShape(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"),
                   "Input(X) of HashOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
                   "Output(Out) of HashOp should not be null.");

    auto dims = ctx->GetInputDim("X");
    PADDLE_ENFORCE_EQ(dims.size(), 2UL,
                      "The input of hash_op's dimensions must be 2");
    std::vector<int64_t> out_dims;
    out_dims.reserve(dims.size() + 1);
    // copy all dims except the last one
    for (size_t i = 0u; i != dims.size() - 1; ++i) {
      out_dims.emplace_back(dims[i]);
    }
    int num_hash = ctx->Attrs().Get<int>("num_hash");
    out_dims.emplace_back(num_hash);
    // keep the last dim to 1
    out_dims.emplace_back(1);

    // set the computed dims, not the raw input dims
    ctx->SetOutputDim("Out", framework::make_ddim(out_dims));
    ctx->ShareLoD("X", /*->*/ "Out");
  }
};

class HashOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    AddInput("X", "(Tensor) Input tensor of hash operator.");
    AddOutput("Out", "(Tensor) Output tensor of hash operator.");
    AddComment(R"DOC(
**Hash Operator**

Applies XXH64 to each row of the input `num_hash` times, using the hash
index as the seed, and takes each result modulo `mod_by`:

$$Out_{i,j} = XXH64(X_i, j) \bmod mod\_by$$
)DOC");
    AddAttr<int>("num_hash", "The number of hash seeds to apply.")
        .SetDefault(1);
    AddAttr<int>("mod_by",
                 "The hash space size; results are taken modulo this value.")
        .SetDefault(100000);
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;

REGISTER_OP_WITHOUT_GRADIENT(hash, ops::HashOp, ops::HashOpMaker);
REGISTER_OP_CPU_KERNEL(hash, ops::HashKernel<int>, ops::HashKernel<int64_t>);
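As a quick illustration of the shape rule InferShape encodes, here is a minimal Python sketch (hash_out_shape is a hypothetical helper, not part of this commit):

    def hash_out_shape(in_shape, num_hash):
        # Mirror of HashOp::InferShape: keep every dim but the last,
        # append num_hash, then a trailing 1.
        assert len(in_shape) == 2, "hash_op requires a 2-D input"
        return list(in_shape[:-1]) + [num_hash, 1]

    # A [30, 1] input with num_hash=8 yields a [30, 8, 1] output.
    assert hash_out_shape([30, 1], 8) == [30, 8, 1]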

paddle/fluid/operators/hash_op.h

Lines changed: 56 additions & 0 deletions
@@ -0,0 +1,56 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

extern "C" {
#include <xxhash.h>
}
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"

namespace paddle {
namespace operators {

template <typename T>
class HashKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto* out_t = context.Output<framework::LoDTensor>("Out");
    auto* in_t = context.Input<framework::LoDTensor>("X");
    int mod_by = context.Attr<int>("mod_by");
    int num_hash = context.Attr<int>("num_hash");
    auto* output = out_t->mutable_data<T>(context.GetPlace());

    auto in_dims = in_t->dims();
    auto in_lod = in_t->lod();
    PADDLE_ENFORCE_EQ(
        static_cast<uint64_t>(in_dims[0]), in_lod[0].back(),
        "The actual input data's size mismatched with LoD information.");

    auto seq_length = in_dims[0];
    auto last_dim = in_dims[in_dims.size() - 1];
    auto* input = in_t->data<T>();
    // For every row, run XXH64 num_hash times, using the hash index as
    // the seed, and bucket each result into [0, mod_by). Hash the full
    // row of T elements (sizeof(T)) so int64 inputs are covered too.
    for (int idx = 0; idx < seq_length; ++idx) {
      for (int ihash = 0; ihash != num_hash; ++ihash) {
        output[idx * num_hash + ihash] =
            XXH64(input, sizeof(T) * last_dim, ihash) % mod_by;
      }
      input += last_dim;
    }
  }
};

}  // namespace operators
}  // namespace paddle
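To see what the kernel computes without building Paddle, a rough Python equivalent follows. It assumes the third-party xxhash pip package, whose xxh64(data, seed) binds the same XXH64 the kernel calls; hash_rows is an illustrative name, not an API from this commit:

    import numpy as np
    import xxhash  # pip package binding the same XXH64 as the C library

    def hash_rows(x, num_hash, mod_by):
        # Hash the raw bytes of each row once per hash index, using the
        # index as the XXH64 seed, then bucket into [0, mod_by).
        out = np.empty((x.shape[0], num_hash, 1), dtype=np.int64)
        for idx in range(x.shape[0]):
            row = x[idx].tobytes()
            for ihash in range(num_hash):
                out[idx, ihash, 0] = xxhash.xxh64(row, seed=ihash).intdigest() % mod_by
        return out

    x = np.random.randint(0, 10, (30, 1)).astype("int32")
    print(hash_rows(x, num_hash=8, mod_by=10000).shape)  # (30, 8, 1)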

python/paddle/fluid/layers/nn.py

Lines changed: 27 additions & 0 deletions
@@ -151,6 +151,7 @@
     'mul',
     'sigmoid_cross_entropy_with_logits',
     'maxout',
+    'hash',
 ]

@@ -7134,3 +7135,29 @@ def maxout(x, groups, name=None):
         attrs={"groups": groups},
         outputs={"Out": out})
     return out
+
+
+def hash(input, hash_size, num_hash=1, name=None):
+    """
+    Hash the input into an integer smaller than the given hash_size.
+    Args:
+        input (Variable): The input variable, a LoDTensor of one-hot word ids.
+        hash_size (int): The space size of the hash algorithm.
+        num_hash (int): The number of times to hash, default 1.
+    Returns:
+        Variable: The hash result variable, which is a LoDTensor.
+    Examples:
+        .. code-block:: python
+            word_dict = paddle.dataset.imdb.word_dict()
+            x = fluid.layers.data(name='x', shape=[1], dtype='int32', lod_level=1)
+            out = fluid.layers.hash(input=x, hash_size=len(word_dict))
+    """
+    helper = LayerHelper('hash', **locals())
+    out = helper.create_tmp_variable(helper.input_dtype(), stop_gradient=True)
+    helper.append_op(
+        type='hash',
+        inputs={'X': input},
+        outputs={'Out': out},
+        attrs={'num_hash': num_hash,
+               'mod_by': hash_size})
+    return out
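For context, a hedged end-to-end sketch of feeding the new layer on CPU; it assumes the Fluid APIs of this era (fluid.create_lod_tensor, Executor), and the names x, out and x_data are illustrative:

    import numpy as np
    import paddle.fluid as fluid

    # Hash each int32 id into 8 bucket ids in [0, 10000).
    x = fluid.layers.data(name='x', shape=[1], dtype='int32', lod_level=1)
    out = fluid.layers.hash(input=x, hash_size=10000, num_hash=8)

    place = fluid.CPUPlace()
    exe = fluid.Executor(place)

    # 30 ids split into 4 sequences of lengths 9, 4, 11 and 6.
    x_data = fluid.create_lod_tensor(
        np.random.randint(0, 10, (30, 1)).astype('int32'), [[9, 4, 11, 6]], place)
    result, = exe.run(feed={'x': x_data}, fetch_list=[out], return_numpy=False)
    print(np.array(result).shape)  # (30, 8, 1)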

python/paddle/fluid/tests/unittests/test_hash_op.py

Lines changed: 38 additions & 0 deletions
@@ -0,0 +1,38 @@
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest
import numpy as np
from op_test import OpTest


class TestHashOp(OpTest):
    def setUp(self):
        self.op_type = "hash"
        self.init_test_case()
        self.inputs = {'X': (self.in_seq, self.lod)}
        self.attrs = {'num_hash': 8, 'mod_by': 10000}
        self.outputs = {'Out': (self.out_seq, self.lod)}

    def init_test_case(self):
        self.in_seq = np.random.randint(0, 10, (30, 1)).astype("int32")
        self.lod = [[9, 4, 11, 6]]
        self.out_seq = np.ones([30, 8, 1], dtype=np.int32)

    def test_check_output(self):
        self.check_output()


if __name__ == "__main__":
    unittest.main()
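One note on the test's LoD: the kernel enforces in_dims[0] == in_lod[0].back(), which holds because Fluid converts Python-side sequence lengths to cumulative offsets. A minimal sketch of that conversion (lengths_to_offsets is illustrative, not a Fluid API):

    def lengths_to_offsets(lengths):
        # Fluid stores LoD as cumulative offsets; the last offset must
        # equal the batch size (in_dims[0]) for the kernel's check to pass.
        offsets = [0]
        for n in lengths:
            offsets.append(offsets[-1] + n)
        return offsets

    assert lengths_to_offsets([9, 4, 11, 6]) == [0, 9, 13, 24, 30]  # 30 rows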
