Skip to content

Commit 0e79c9b

Browse files
committed
[MINOR] Correct dedup builtin docs
1 parent 9a7c3c7 commit 0e79c9b

File tree

4 files changed

+76
-5
lines changed

4 files changed

+76
-5
lines changed

scripts/builtin/dedup.dml

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -28,11 +28,11 @@
2828
#
2929
# INPUT:
3030
# --------------------------------------------------------------------------------------
31-
# X Input Frame[String] with n rows and d columns (raw tuples)
32-
# gloveMatrix Matrix[Double] of size |V| × e (pretrained GloVe embeddings) -> |V| number of words and e = embedding dimesnion
33-
# vocab Frame[String] of size |V| × 1 (vocabulary aligned with gloveMatrix)
34-
# similarityMeasure (optional) String specifying similarity metric: "cosine", "euclidean"
35-
# threshold (optional) Double: threshold value above which tuples are considered duplicates
31+
# X Input Frame[String] with n rows and d columns (raw tuples)
32+
# gloveMatrix Matrix[Double] of size |V| × e (pretrained GloVe embeddings) -> |V| number of words and e = embedding dimension
33+
# vocab Frame[String] of size |V| × 1 (vocabulary aligned with gloveMatrix)
34+
# similarityMeasure (optional) String specifying similarity metric: "cosine", "euclidean"
35+
# threshold (optional) Double: threshold value above which tuples are considered duplicates
3636
# --------------------------------------------------------------------------------------
3737
#
3838
# OUTPUT:

src/main/python/systemds/operator/algorithm/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@
5353
from .builtin.dbscanApply import dbscanApply
5454
from .builtin.decisionTree import decisionTree
5555
from .builtin.decisionTreePredict import decisionTreePredict
56+
from .builtin.dedup import dedup
5657
from .builtin.deepWalk import deepWalk
5758
from .builtin.denialConstraints import denialConstraints
5859
from .builtin.differenceStatistics import differenceStatistics
@@ -251,6 +252,7 @@
251252
'dbscanApply',
252253
'decisionTree',
253254
'decisionTreePredict',
255+
'dedup',
254256
'deepWalk',
255257
'denialConstraints',
256258
'differenceStatistics',
src/main/python/systemds/operator/algorithm/builtin/dedup.py

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
# -------------------------------------------------------------
2+
#
3+
# Licensed to the Apache Software Foundation (ASF) under one
4+
# or more contributor license agreements. See the NOTICE file
5+
# distributed with this work for additional information
6+
# regarding copyright ownership. The ASF licenses this file
7+
# to you under the Apache License, Version 2.0 (the
8+
# "License"); you may not use this file except in compliance
9+
# with the License. You may obtain a copy of the License at
10+
#
11+
# http://www.apache.org/licenses/LICENSE-2.0
12+
#
13+
# Unless required by applicable law or agreed to in writing,
14+
# software distributed under the License is distributed on an
15+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16+
# KIND, either express or implied. See the License for the
17+
# specific language governing permissions and limitations
18+
# under the License.
19+
#
20+
# -------------------------------------------------------------
21+
22+
# Autogenerated By : src/main/python/generator/generator.py
23+
# Autogenerated From : scripts/builtin/dedup.dml
24+
25+
from typing import Dict, Iterable
26+
27+
from systemds.operator import OperationNode, Matrix, Frame, List, MultiReturn, Scalar
28+
from systemds.utils.consts import VALID_INPUT_TYPES
29+
30+
31+
def dedup(X: Frame,
32+
gloveMatrix: Matrix,
33+
vocab: Frame,
34+
**kwargs: Dict[str, VALID_INPUT_TYPES]):
35+
"""
36+
Builtin for deduplication using distributed representations (DRs) and
37+
locality-sensitive hashing (LSH) based blocking.
38+
39+
The function encodes each input tuple as a dense vector using pre-trained GloVe embeddings (simple averaging),
40+
groups semantically similar tuples via LSH into buckets, and compares only those pairs for deduplication.
41+
42+
43+
44+
45+
:param X: Input Frame[String] with n rows and d columns (raw tuples)
46+
:param gloveMatrix: Matrix[Double] of size |V| × e (pretrained GloVe embeddings) -> |V| number of words and e = embedding dimension
47+
:param vocab: Frame[String] of size |V| × 1 (vocabulary aligned with gloveMatrix)
48+
:param similarityMeasure: (optional) String specifying similarity metric: "cosine", "euclidean"
49+
:param threshold: (optional) Double: threshold value above which tuples are considered duplicates
50+
:return: Frame[String] with deduplicated tuples
51+
(first occurrence of each duplicate group is retained)
52+
:return: Frame[String] with all detected duplicates
53+
(i.e., tuples removed from the input)
54+
"""
55+
56+
params_dict = {'X': X, 'gloveMatrix': gloveMatrix, 'vocab': vocab}
57+
params_dict.update(kwargs)
58+
59+
vX_0 = Frame(X.sds_context, '')
60+
vX_1 = Frame(X.sds_context, '')
61+
output_nodes = [vX_0, vX_1, ]
62+
63+
op = MultiReturn(X.sds_context, 'dedup', output_nodes, named_input_nodes=params_dict)
64+
65+
vX_0._unnamed_input_nodes = [op]
66+
vX_1._unnamed_input_nodes = [op]
67+
68+
return op

src/main/python/systemds/operator/algorithm/builtin/glove.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ def glove(input: Frame,
4242
distanceWeighting: bool,
4343
symmetric: bool):
4444
"""
45+
Computes the vector embeddings for words in a large text corpus.
4546
4647
4748

0 commit comments

Comments
 (0)