Skip to content

Commit f575616

Browse files
huaxingao and Felix Cheung
authored and committed
[SPARK-25859][ML] add scala/java/python example and doc for PrefixSpan
## What changes were proposed in this pull request? add scala/java/python example and doc for PrefixSpan in branch 2.4 ## How was this patch tested? Manually tested Author: Huaxin Gao <[email protected]> Closes apache#22863 from huaxingao/mydocbranch.
1 parent 313a1f0 commit f575616

File tree

4 files changed

+224
-0
lines changed

4 files changed

+224
-0
lines changed

docs/ml-frequent-pattern-mining.md

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,3 +85,49 @@ Refer to the [R API docs](api/R/spark.fpGrowth.html) for more details.
8585
</div>
8686

8787
</div>
88+
89+
## PrefixSpan
90+
91+
PrefixSpan is a sequential pattern mining algorithm described in
92+
[Pei et al., Mining Sequential Patterns by Pattern-Growth: The
93+
PrefixSpan Approach](http://dx.doi.org/10.1109%2FTKDE.2004.77). We refer
94+
the reader to the referenced paper for a formal definition of the sequential
95+
pattern mining problem.
96+
97+
`spark.ml`'s PrefixSpan implementation takes the following parameters:
98+
99+
* `minSupport`: the minimum support required to be considered a frequent
100+
sequential pattern.
101+
* `maxPatternLength`: the maximum length of a frequent sequential
102+
pattern. Any frequent pattern exceeding this length will not be
103+
included in the results.
104+
* `maxLocalProjDBSize`: the maximum number of items allowed in a
105+
prefix-projected database before local iterative processing of the
106+
projected database begins. This parameter should be tuned with respect
107+
to the size of your executors.
108+
* `sequenceCol`: the name of the sequence column in the dataset (default "sequence"). Rows with
109+
nulls in this column are ignored.
110+
111+
**Examples**
112+
113+
<div class="codetabs">
114+
115+
<div data-lang="scala" markdown="1">
116+
Refer to the [Scala API docs](api/scala/index.html#org.apache.spark.ml.fpm.PrefixSpan) for more details.
117+
118+
{% include_example scala/org/apache/spark/examples/ml/PrefixSpanExample.scala %}
119+
</div>
120+
121+
<div data-lang="java" markdown="1">
122+
Refer to the [Java API docs](api/java/org/apache/spark/ml/fpm/PrefixSpan.html) for more details.
123+
124+
{% include_example java/org/apache/spark/examples/ml/JavaPrefixSpanExample.java %}
125+
</div>
126+
127+
<div data-lang="python" markdown="1">
128+
Refer to the [Python API docs](api/python/pyspark.ml.html#pyspark.ml.fpm.PrefixSpan) for more details.
129+
130+
{% include_example python/ml/prefixspan_example.py %}
131+
</div>
132+
133+
</div>
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package org.apache.spark.examples.ml;
19+
20+
// $example on$
21+
22+
import org.apache.spark.ml.fpm.PrefixSpan;
23+
import org.apache.spark.sql.Dataset;
24+
import org.apache.spark.sql.Row;
25+
import org.apache.spark.sql.RowFactory;
26+
import org.apache.spark.sql.SparkSession;
27+
import org.apache.spark.sql.types.*;
28+
29+
import java.util.Arrays;
30+
import java.util.List;
31+
// $example off$
32+
33+
/**
34+
* An example demonstrating PrefixSpan.
35+
* Run with
36+
* <pre>
37+
* bin/run-example ml.JavaPrefixSpanExample
38+
* </pre>
39+
*/
40+
public class JavaPrefixSpanExample {
41+
public static void main(String[] args) {
42+
SparkSession spark = SparkSession
43+
.builder()
44+
.appName("JavaPrefixSpanExample")
45+
.getOrCreate();
46+
47+
// $example on$
48+
List<Row> data = Arrays.asList(
49+
RowFactory.create(Arrays.asList(Arrays.asList(1, 2), Arrays.asList(3))),
50+
RowFactory.create(Arrays.asList(Arrays.asList(1), Arrays.asList(3, 2), Arrays.asList(1,2))),
51+
RowFactory.create(Arrays.asList(Arrays.asList(1, 2), Arrays.asList(5))),
52+
RowFactory.create(Arrays.asList(Arrays.asList(6)))
53+
);
54+
StructType schema = new StructType(new StructField[]{ new StructField(
55+
"sequence", new ArrayType(new ArrayType(DataTypes.IntegerType, true), true),
56+
false, Metadata.empty())
57+
});
58+
Dataset<Row> sequenceDF = spark.createDataFrame(data, schema);
59+
60+
PrefixSpan prefixSpan = new PrefixSpan().setMinSupport(0.5).setMaxPatternLength(5);
61+
62+
// Finding frequent sequential patterns
63+
prefixSpan.findFrequentSequentialPatterns(sequenceDF).show();
64+
// $example off$
65+
66+
spark.stop();
67+
}
68+
}
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
#
2+
# Licensed to the Apache Software Foundation (ASF) under one or more
3+
# contributor license agreements. See the NOTICE file distributed with
4+
# this work for additional information regarding copyright ownership.
5+
# The ASF licenses this file to You under the Apache License, Version 2.0
6+
# (the "License"); you may not use this file except in compliance with
7+
# the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
#
17+
18+
"""
19+
An example demonstrating PrefixSpan.
20+
Run with:
21+
bin/spark-submit examples/src/main/python/ml/prefixspan_example.py
22+
"""
23+
# $example on$
24+
from pyspark.ml.fpm import PrefixSpan
25+
# $example off$
26+
from pyspark.sql import Row, SparkSession
27+
28+
if __name__ == "__main__":
    # Script entry point: start a session, run PrefixSpan on a tiny
    # in-memory dataset, print the patterns found, then shut down.
    spark = SparkSession\
        .builder\
        .appName("PrefixSpanExample")\
        .getOrCreate()

    # $example on$
    # Built directly from `spark` so the snippet rendered in the docs does not
    # depend on names defined outside the example markers.
    df = spark.createDataFrame([
        ([[1, 2], [3]],),
        ([[1], [3, 2], [1, 2]],),
        ([[1, 2], [5]],),
        ([[6]],)
    ], ["sequence"])

    prefixSpan = PrefixSpan(minSupport=0.5, maxPatternLength=5,
                            maxLocalProjDBSize=32000000)

    # Find frequent sequential patterns.
    prefixSpan.findFrequentSequentialPatterns(df).show()
    # $example off$

    spark.stop()
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package org.apache.spark.examples.ml
19+
20+
// scalastyle:off println
21+
22+
// $example on$
23+
import org.apache.spark.ml.fpm.PrefixSpan
24+
// $example off$
25+
import org.apache.spark.sql.SparkSession
26+
27+
/**
 * An example demonstrating PrefixSpan.
 * Run with
 * {{{
 * bin/run-example ml.PrefixSpanExample
 * }}}
 */
object PrefixSpanExample {

  /**
   * Builds a small in-memory dataset of sequences, runs PrefixSpan over it and
   * prints the frequent sequential patterns that were found.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName(s"${this.getClass.getSimpleName}")
      .getOrCreate()
    import spark.implicits._

    // $example on$
    val smallTestData = Seq(
      Seq(Seq(1, 2), Seq(3)),
      Seq(Seq(1), Seq(3, 2), Seq(1, 2)),
      Seq(Seq(1, 2), Seq(5)),
      Seq(Seq(6)))

    val df = smallTestData.toDF("sequence")
    // Bind the resulting DataFrame itself; `show()` returns Unit, so chaining it
    // here would leave `result` holding no usable value.
    val result = new PrefixSpan()
      .setMinSupport(0.5)
      .setMaxPatternLength(5)
      .setMaxLocalProjDBSize(32000000)
      .findFrequentSequentialPatterns(df)
    result.show()
    // $example off$

    spark.stop()
  }
}
62+
// scalastyle:on println

0 commit comments

Comments
 (0)