@@ -36,7 +36,7 @@ Sparkit-learn introduces two important distributed data format:
3636 rdd = sc.parallelize(data, 2 ) # each partition with 10 elements
3737 # ArrayRDD
3838 # each partition will contain blocks with 5 elements
39- X = ArrayRDD(rdd, block_size=5)  # 4 blocks, 2 in each partition
39+ X = ArrayRDD(rdd, bsize=5)  # 4 blocks, 2 in each partition
4040
4141 Basic operations:
4242
@@ -71,7 +71,7 @@ Sparkit-learn introduces two important distributed data format:
7171 # array([ 0, 1, 2, ... 17, 18, 19])
7272
7373 # pyspark.rdd operations will still work
74- X.numPartitions()  # 2 - number of partitions
74+ X.getNumPartitions()  # 2 - number of partitions
7575
7676 - **DictRDD:**
7777
@@ -84,19 +84,19 @@ Sparkit-learn introduces two important distributed data format:
8484 X = range(20)
8585 y = range(2) * 10
8686 # PySpark RDD with 2 partitions
87- X_rdd = sc.parallelize(data_X, 2)  # each partition with 10 elements
88- y_rdd = sc.parallelize(data_y, 2)  # each partition with 10 elements
87+ X_rdd = sc.parallelize(X, 2)  # each partition with 10 elements
88+ y_rdd = sc.parallelize(y, 2)  # each partition with 10 elements
8989 zipped_rdd = X_rdd.zip(y_rdd) # zip the two rdd's together
9090 # DictRDD
9191 # each partition will contain blocks with 5 elements
92- Z = DictRDD(zipped_rdd, columns=('X', 'y'), block_size=5)  # 4 blocks, 2/partition
92+ Z = DictRDD(zipped_rdd, columns=('X', 'y'), bsize=5)  # 4 blocks, 2/partition
9393
9494 # or:
9595 import numpy as np
9696
9797 data = np.array([range(20), range(2) * 10]).T
9898 rdd = sc.parallelize(data, 2)
99- Z = DictRDD(rdd, columns=('X', 'y'), block_size=5)
99+ Z = DictRDD(rdd, columns=('X', 'y'), bsize=5)
100100
101101 Basic operations:
102102
0 commit comments