Merge pull request #397 from s22s/feature/pre-partition-datasources

vpipkt · web-flow · commit 549c308401c3 · 2019-10-21T10:40:37.000-04:00
Applying pre-partitioning to DataSources.
diff --git a/datasource/src/main/scala/org/locationtech/rasterframes/datasource/raster/RasterSourceRelation.scala b/datasource/src/main/scala/org/locationtech/rasterframes/datasource/raster/RasterSourceRelation.scala
@@ -69,6 +69,9 @@ case class RasterSourceRelation(
     catalog.schema.fields.filter(f => !catalogTable.bandColumnNames.contains(f.name))
   }
 
+  protected def defaultNumPartitions: Int =
+    sqlContext.sparkSession.sessionState.conf.numShufflePartitions
+
   override def schema: StructType = {
     val tileSchema = schemaOf[ProjectedRasterTile]
     val paths = for {
@@ -84,10 +87,11 @@ case class RasterSourceRelation(
   override def buildScan(): RDD[Row] = {
     import sqlContext.implicits._
 
-    // The general transformaion is:
+    // The general transformation is:
     // input -> path -> src -> ref -> tile
     // Each step is broken down for readability
     val inputs: DataFrame = sqlContext.table(catalogTable.tableName)
+      .repartition(defaultNumPartitions)
 
     // Basically renames the input columns to have the '_path' suffix
     val pathsAliasing = for {
@@ -112,7 +116,7 @@ case class RasterSourceRelation(
 
     val df = if (lazyTiles) {
       // Expand RasterSource into multiple columns per band, and multiple rows per tile
-      // There's some unintentional fragililty here in that the structure of the expression
+      // There's some unintentional fragility here in that the structure of the expression
       // is expected to line up with our column structure here.
       val refs = RasterSourceToRasterRefs(subtileDims, bandIndexes, srcs: _*) as refColNames
 
diff --git a/experimental/src/main/scala/org/locationtech/rasterframes/experimental/datasource/CachedDatasetRelation.scala b/experimental/src/main/scala/org/locationtech/rasterframes/experimental/datasource/CachedDatasetRelation.scala
@@ -33,6 +33,8 @@ import org.locationtech.rasterframes.util._
  * @since 8/24/18
  */
 trait CachedDatasetRelation extends ResourceCacheSupport { self: BaseRelation ⇒
+  protected def defaultNumPartitions: Int =
+    sqlContext.sparkSession.sessionState.conf.numShufflePartitions
   protected def cacheFile: HadoopPath
   protected def constructDataset: Dataset[Row]
 
diff --git a/experimental/src/main/scala/org/locationtech/rasterframes/experimental/datasource/awspds/L8CatalogRelation.scala b/experimental/src/main/scala/org/locationtech/rasterframes/experimental/datasource/awspds/L8CatalogRelation.scala
@@ -68,7 +68,9 @@ case class L8CatalogRelation(sqlContext: SQLContext, sceneListPath: HadoopPath)
       .select(schema.map(f ⇒ col(f.name)): _*)
       .orderBy(ACQUISITION_DATE.name, PATH.name, ROW.name)
       .distinct() // The scene file contains duplicates.
-      .repartition(8, col(PATH.name), col(ROW.name))
+      .repartition(defaultNumPartitions, col(PATH.name), col(ROW.name))
+
+
   }
 }
 
diff --git a/experimental/src/main/scala/org/locationtech/rasterframes/experimental/datasource/awspds/MODISCatalogRelation.scala b/experimental/src/main/scala/org/locationtech/rasterframes/experimental/datasource/awspds/MODISCatalogRelation.scala
@@ -64,7 +64,7 @@ case class MODISCatalogRelation(sqlContext: SQLContext, sceneList: HadoopPath)
         $"${GID.name}") ++ bandCols: _*
       )
       .orderBy(ACQUISITION_DATE.name, GID.name)
-      .repartition(8, col(GRANULE_ID.name))
+      .repartition(defaultNumPartitions, col(GRANULE_ID.name))
   }
 }
 

Original file line number	Diff line number	Diff line change
`@@ -68,7 +68,9 @@ case class L8CatalogRelation(sqlContext: SQLContext, sceneListPath: HadoopPath)`
`68`	`68`	`.select(schema.map(f ⇒ col(f.name)): _*)`
`69`	`69`	`.orderBy(ACQUISITION_DATE.name, PATH.name, ROW.name)`
`70`	`70`	`.distinct() // The scene file contains duplicates.`
`71`		`- .repartition(8, col(PATH.name), col(ROW.name))`
	`71`	`+ .repartition(defaultNumPartitions, col(PATH.name), col(ROW.name))`
	`72`	`+`
	`73`	`+`
`72`	`74`	`}`
`73`	`75`	`}`
`74`	`76`
Original file line number	Diff line number	Diff line change
`@@ -64,7 +64,7 @@ case class MODISCatalogRelation(sqlContext: SQLContext, sceneList: HadoopPath)`
`64`	`64`	`$"${GID.name}") ++ bandCols: _*`
`65`	`65`	`)`
`66`	`66`	`.orderBy(ACQUISITION_DATE.name, GID.name)`
`67`		`- .repartition(8, col(GRANULE_ID.name))`
	`67`	`+ .repartition(defaultNumPartitions, col(GRANULE_ID.name))`
`68`	`68`	`}`
`69`	`69`	`}`
`70`	`70`