Commit 1a74338

Remove STACDataFrame searchLimit parameter

1 parent fdd46cb · commit 1a74338

8 files changed: +63 −51 lines changed

datasource/src/main/scala/org/locationtech/rasterframes/datasource/stac/api/StacApiDataSource.scala

Lines changed: 1 addition & 2 deletions

@@ -16,12 +16,11 @@ class StacApiDataSource extends TableProvider with DataSourceRegister {
   def getTable(structType: StructType, transforms: Array[Transform], map: util.Map[String, String]): Table =
     new StacApiTable()
 
-  override def shortName(): String = "stac-api"
+  def shortName(): String = StacApiDataSource.SHORT_NAME
 }
 
 object StacApiDataSource {
   final val SHORT_NAME = "stac-api"
   final val URI_PARAM = "uri"
   final val SEARCH_FILTERS_PARAM = "search-filters"
-  final val SEARCH_LIMIT_PARAM = "search-limit"
 }
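Note: shortName now delegates to the companion constant, so callers can reference StacApiDataSource.SHORT_NAME instead of repeating the "stac-api" literal. A minimal sketch of the equivalence (an existing SparkSession named spark is assumed):

    import org.locationtech.rasterframes.datasource.stac.api.StacApiDataSource

    // both readers resolve to the same data source, registered via DataSourceRegister
    val viaConstant = spark.read.format(StacApiDataSource.SHORT_NAME)
    val viaLiteral  = spark.read.format("stac-api")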

datasource/src/main/scala/org/locationtech/rasterframes/datasource/stac/api/StacApiPartition.scala

Lines changed: 9 additions & 17 deletions

@@ -7,13 +7,12 @@ import com.azavea.stac4s.StacItem
 import geotrellis.store.util.BlockingThreadPool
 import sttp.client3.asynchttpclient.cats.AsyncHttpClientCatsBackend
 import com.azavea.stac4s.api.client._
-import eu.timepit.refined.types.numeric.NonNegInt
 import cats.effect.IO
 import sttp.model.Uri
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.connector.read.{InputPartition, PartitionReader, PartitionReaderFactory}
 
-case class StacApiPartition(uri: Uri, searchFilters: SearchFilters, searchLimit: Option[NonNegInt]) extends InputPartition
+case class StacApiPartition(uri: Uri, searchFilters: SearchFilters) extends InputPartition
 
 class StacApiPartitionReaderFactory extends PartitionReaderFactory {
   override def createReader(partition: InputPartition): PartitionReader[InternalRow] = {
@@ -25,24 +24,17 @@ class StacApiPartitionReaderFactory extends PartitionReaderFactory {
 }
 
 class StacApiPartitionReader(partition: StacApiPartition) extends PartitionReader[InternalRow] {
-  lazy val partitionValues: Iterator[StacItem] = {
-    implicit val cs = IO.contextShift(BlockingThreadPool.executionContext)
-    AsyncHttpClientCatsBackend
-      .resource[IO]()
-      .use { backend =>
-        SttpStacClient(backend, partition.uri)
-          .search(partition.searchFilters)
-          .take(partition.searchLimit.map(_.value))
-          .compile
-          .toList
-      }
-      .map(_.toIterator)
-      .unsafeRunSync()
-  }
+
+  @transient private implicit lazy val cs = IO.contextShift(BlockingThreadPool.executionContext)
+  @transient private lazy val backend = AsyncHttpClientCatsBackend[IO]().unsafeRunSync()
+  @transient private lazy val partitionValues: Iterator[StacItem] =
+    SttpStacClient(backend, partition.uri)
+      .search(partition.searchFilters)
+      .toIterator(_.unsafeRunSync())
 
   def next: Boolean = partitionValues.hasNext
 
   def get: InternalRow = partitionValues.next.toInternalRow
 
-  def close(): Unit = { }
+  def close(): Unit = backend.close().unsafeRunSync()
 }
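A note on the new reader fields: Spark serializes reader state between driver and executors, and an HTTP backend is not serializable. The @transient lazy val combination skips the field during serialization and rebuilds it on first access on the executor. A hypothetical, self-contained sketch of the pattern (names here are illustrative, not from the commit):

    import java.util.concurrent.{ExecutorService, Executors}

    class ResourceHolder extends Serializable {
      // @transient: skipped by Java serialization; lazy: re-created on first
      // access after deserialization, so the non-serializable resource is
      // built where it is used instead of being shipped over the wire
      @transient private lazy val pool: ExecutorService =
        Executors.newSingleThreadExecutor() // stands in for the sttp backend
      def shutdown(): Unit = pool.shutdown() // mirrors close() releasing the backend
    }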

datasource/src/main/scala/org/locationtech/rasterframes/datasource/stac/api/StacApiScanBuilder.scala

Lines changed: 4 additions & 4 deletions

@@ -8,12 +8,12 @@ import org.apache.spark.sql.connector.read.{Batch, InputPartition, PartitionRead
 import org.apache.spark.sql.types.StructType
 import sttp.model.Uri
 
-class StacApiScanBuilder(uri: Uri, searchFilters: SearchFilters, searchLimit: Option[NonNegInt]) extends ScanBuilder {
-  override def build(): Scan = new StacApiBatchScan(uri, searchFilters, searchLimit)
+class StacApiScanBuilder(uri: Uri, searchFilters: SearchFilters) extends ScanBuilder {
+  def build(): Scan = new StacApiBatchScan(uri, searchFilters)
 }
 
 /** Batch Reading Support. The schema is repeated here as it can change after column pruning, etc. */
-class StacApiBatchScan(uri: Uri, searchFilters: SearchFilters, searchLimit: Option[NonNegInt]) extends Scan with Batch {
+class StacApiBatchScan(uri: Uri, searchFilters: SearchFilters) extends Scan with Batch {
   def readSchema(): StructType = stacItemEncoder.schema
 
   override def toBatch: Batch = this
@@ -23,6 +23,6 @@ class StacApiBatchScan(uri: Uri, searchFilters: SearchFilters, searchLimit: Opti
   * To perform a distributed load, we'd need to know some internals about how the next page token is computed.
   * This can be a good idea for the STAC Spec extension.
   * */
-  def planInputPartitions(): Array[InputPartition] = Array(StacApiPartition(uri, searchFilters, searchLimit))
+  def planInputPartitions(): Array[InputPartition] = Array(StacApiPartition(uri, searchFilters))
   def createReaderFactory(): PartitionReaderFactory = new StacApiPartitionReaderFactory()
 }
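Since planInputPartitions returns a one-element array, every load still lands in a single Spark partition, which the tests below assert. A hedged sketch of the observable effect (the endpoint is borrowed from the tests, and an existing SparkSession is assumed):

    val results = spark.read
      .format("stac-api")
      .option("uri", "https://eod-catalog-svc-prod.astraea.earth/")
      .load()
    assert(results.rdd.getNumPartitions == 1) // sequential paging => one partition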

datasource/src/main/scala/org/locationtech/rasterframes/datasource/stac/api/StacApiTable.scala

Lines changed: 3 additions & 5 deletions

@@ -7,8 +7,8 @@ import org.apache.spark.sql.connector.catalog.{SupportsRead, Table, TableCapabil
 import org.apache.spark.sql.connector.read.ScanBuilder
 import org.apache.spark.sql.types.StructType
 import org.apache.spark.sql.util.CaseInsensitiveStringMap
-import org.locationtech.rasterframes.datasource.stac.api.StacApiDataSource.{SEARCH_LIMIT_PARAM, SEARCH_FILTERS_PARAM, URI_PARAM}
-import org.locationtech.rasterframes.datasource.{intParam, jsonParam, uriParam}
+import org.locationtech.rasterframes.datasource.stac.api.StacApiDataSource.{SEARCH_FILTERS_PARAM, URI_PARAM}
+import org.locationtech.rasterframes.datasource.{jsonParam, uriParam}
 import sttp.model.Uri
 
 import scala.collection.JavaConverters._
@@ -24,7 +24,7 @@ class StacApiTable extends Table with SupportsRead {
   def capabilities(): util.Set[TableCapability] = Set(TableCapability.BATCH_READ).asJava
 
   def newScanBuilder(options: CaseInsensitiveStringMap): ScanBuilder =
-    new StacApiScanBuilder(options.uri, options.searchFilters, options.searchLimit)
+    new StacApiScanBuilder(options.uri, options.searchFilters)
 }
 
 object StacApiTable {
@@ -35,7 +35,5 @@ object StacApiTable {
     jsonParam(SEARCH_FILTERS_PARAM, options)
      .flatMap(_.as[SearchFilters].toOption)
      .getOrElse(SearchFilters(limit = NonNegInt.from(30).toOption))
-
-    def searchLimit: Option[NonNegInt] = intParam(SEARCH_LIMIT_PARAM, options).flatMap(NonNegInt.from(_).toOption)
  }
 }
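With searchLimit gone, uri and search-filters are the only options StacApiTable consumes; a missing or undecodable search-filters value falls back to SearchFilters(limit = NonNegInt.from(30).toOption). A minimal sketch of passing the filters as raw JSON (the filter body is a hypothetical example):

    val df = spark.read
      .format("stac-api")
      .option("uri", "https://franklin.nasa-hsi.azavea.com/")
      .option("search-filters", """{"collections": ["aviris-l1-cogs"]}""")
      .load()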

datasource/src/main/scala/org/locationtech/rasterframes/datasource/stac/api/package.scala

Lines changed: 26 additions & 4 deletions

@@ -1,9 +1,11 @@
 package org.locationtech.rasterframes.datasource.stac
 
+import cats.Monad
+import cats.syntax.functor._
 import com.azavea.stac4s.api.client.SearchFilters
 import org.apache.spark.sql.{DataFrame, DataFrameReader}
 import io.circe.syntax._
-import fs2.Stream
+import fs2.{Pull, Stream}
 import shapeless.tag
 import shapeless.tag.@@
 import org.apache.spark.sql.SparkSession
@@ -17,6 +19,7 @@ package object api {
 
   implicit class StacApiDataFrameReaderOps(val reader: StacApiDataFrameReader) extends AnyVal {
     def loadStac: StacApiDataFrame = tag[StacApiDataFrameTag][DataFrame](reader.load)
+    def loadStac(limit: Int): StacApiDataFrame = tag[StacApiDataFrameTag][DataFrame](reader.load.limit(limit))
   }
 
   implicit class StacApiDataFrameOps(val df: StacApiDataFrame) extends AnyVal {
@@ -38,7 +41,27 @@ package object api {
   }
 
   implicit class Fs2StreamOps[F[_], T](val self: Stream[F, T]) {
-    def take(n: Option[Int]): Stream[F, T] = n.fold(self)(self.take(_))
+    /** Unsafe API to interop with the Spark API. */
+    def toIterator(run: F[Option[(T, fs2.Stream[F, T])]] => Option[(T, fs2.Stream[F, T])])
+                  (implicit monad: Monad[F], compiler: Stream.Compiler[F, F]): Iterator[T] = new Iterator[T] {
+      private var head = self
+      private def nextF: F[Option[(T, fs2.Stream[F, T])]] =
+        head
+          .pull.uncons1
+          .flatMap(Pull.output1)
+          .stream
+          .compile
+          .last
+          .map(_.flatten)
+
+      def hasNext(): Boolean = run(nextF).nonEmpty
+
+      def next(): T = {
+        val (item, tail) = run(nextF).get
+        this.head = tail
+        item
+      }
+    }
   }
 
   implicit class DataFrameReaderOps(val self: DataFrameReader) extends AnyVal {
@@ -48,12 +71,11 @@ package object api {
 
   implicit class DataFrameReaderStacApiOps(val reader: DataFrameReader) extends AnyVal {
     def stacApi(): StacApiDataFrameReader = tag[StacApiDataFrameTag][DataFrameReader](reader.format(StacApiDataSource.SHORT_NAME))
-    def stacApi(uri: String, filters: SearchFilters = SearchFilters(), searchLimit: Option[Int] = None): StacApiDataFrameReader =
+    def stacApi(uri: String, filters: SearchFilters = SearchFilters()): StacApiDataFrameReader =
       tag[StacApiDataFrameTag][DataFrameReader](
        stacApi()
          .option(StacApiDataSource.URI_PARAM, uri)
          .option(StacApiDataSource.SEARCH_FILTERS_PARAM, filters.asJson.noSpaces)
-         .option(StacApiDataSource.SEARCH_LIMIT_PARAM, searchLimit)
      )
  }
 }
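A hedged sketch of the new interop in isolation: toIterator turns an fs2 Stream[F, T] into a plain Scala Iterator by compiling one uncons1 pull per element, which is what lets StacApiPartitionReader stream search pages instead of materializing them with compile.toList. The sketch assumes cats-effect 2.x, matching the IO.contextShift usage elsewhere in this commit:

    import cats.effect.IO
    import fs2.Stream
    import org.locationtech.rasterframes.datasource.stac.api._ // Fs2StreamOps syntax

    val stream: Stream[IO, Int] = Stream.emits(1 to 5).covary[IO]
    val it: Iterator[Int] = stream.toIterator(_.unsafeRunSync())
    assert(it.take(2).toList == List(1, 2)) // elements are pulled on demand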

datasource/src/test/scala/org/locationtech/rasterframes/datasource/stac/api/StacApiDataSourceTest.scala

Lines changed: 18 additions & 14 deletions

@@ -25,9 +25,7 @@ import org.locationtech.rasterframes.datasource.raster._
 import org.locationtech.rasterframes.datasource.stac.api.encoders._
 import com.azavea.stac4s.StacItem
 import com.azavea.stac4s.api.client.{SearchFilters, SttpStacClient}
-import cats.syntax.option._
 import cats.effect.IO
-import eu.timepit.refined.auto._
 import geotrellis.store.util.BlockingThreadPool
 import org.apache.spark.sql.functions.explode
 import org.locationtech.rasterframes.TestEnvironment
@@ -45,9 +43,10 @@ class StacApiDataSourceTest extends TestEnvironment { self =>
       .read
       .stacApi(
         "https://franklin.nasa-hsi.azavea.com/",
-        filters = SearchFilters(items = List("aviris-l1-cogs_f130329t01p00r06_sc01")),
-        searchLimit = Some(1)
-      ).load
+        filters = SearchFilters(items = List("aviris-l1-cogs_f130329t01p00r06_sc01"))
+      )
+      .load
+      .limit(1)
 
     results.rdd.partitions.length shouldBe 1
     results.count() shouldBe 1L
@@ -78,9 +77,10 @@ class StacApiDataSourceTest extends TestEnvironment { self =>
       .read
       .stacApi(
         "https://franklin.nasa-hsi.azavea.com/",
-        filters = SearchFilters(items = List("aviris-l1-cogs_f130329t01p00r06_sc01")),
-        searchLimit = Some(1)
-      ).load
+        filters = SearchFilters(items = List("aviris-l1-cogs_f130329t01p00r06_sc01"))
+      )
+      .load
+      .limit(1)
 
     results.rdd.partitions.length shouldBe 1
 
@@ -118,10 +118,9 @@ class StacApiDataSourceTest extends TestEnvironment { self =>
       .read
       .stacApi(
         "https://franklin.nasa-hsi.azavea.com/",
-        filters = SearchFilters(items = List("aviris-l1-cogs_f130329t01p00r06_sc01")),
-        searchLimit = Some(1)
+        filters = SearchFilters(items = List("aviris-l1-cogs_f130329t01p00r06_sc01"))
       )
-      .loadStac
+      .loadStac(limit = 1) // to preserve the STAC DataFrame type
 
     val assets =
       items
@@ -149,7 +148,7 @@ class StacApiDataSourceTest extends TestEnvironment { self =>
   it("should read from Astraea Earth service") {
     import spark.implicits._
 
-    val results = spark.read.stacApi("https://eod-catalog-svc-prod.astraea.earth/", searchLimit = Some(1)).load
+    val results = spark.read.stacApi("https://eod-catalog-svc-prod.astraea.earth/").load.limit(1)
 
     // results.printSchema()
 
@@ -178,8 +177,9 @@ class StacApiDataSourceTest extends TestEnvironment { self =>
     val items =
       spark
         .read
-        .stacApi("https://eod-catalog-svc-prod.astraea.earth/", searchLimit = 1.some)
+        .stacApi("https://eod-catalog-svc-prod.astraea.earth/")
         .load
+        .limit(1)
 
     println(items.collect().toList.length)
 
@@ -199,7 +199,11 @@ class StacApiDataSourceTest extends TestEnvironment { self =>
 
   ignore("should fetch rasters from the Datacube STAC API service") {
     import spark.implicits._
-    val items = spark.read.stacApi("https://datacube.services.geo.ca/api", filters = SearchFilters(collections=List("markham")), searchLimit = Some(1)).load
+    val items = spark
+      .read
+      .stacApi("https://datacube.services.geo.ca/api", filters = SearchFilters(collections=List("markham")))
+      .load
+      .limit(1)
 
     println(items.collect().toList.length)
 

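Where the tagged DataFrame type matters, the tests use loadStac(limit = 1) rather than .load.limit(1): Dataset.limit returns a plain DataFrame, dropping the StacApiDataFrame tag, while loadStac(limit) applies the same limit and re-tags the result. A hedged sketch of the distinction:

    import org.locationtech.rasterframes.datasource.stac.api._

    // .limit erases the shapeless tag carried by StacApiDataFrame
    val plain: org.apache.spark.sql.DataFrame =
      spark.read.stacApi("https://franklin.nasa-hsi.azavea.com/").load.limit(1)

    // loadStac(limit = 1) limits and re-tags, keeping the StacApiDataFrameOps
    // syntax available on the result
    val tagged: StacApiDataFrame =
      spark.read.stacApi("https://franklin.nasa-hsi.azavea.com/").loadStac(limit = 1)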
pyrasterframes/src/main/python/pyrasterframes/__init__.py

Lines changed: 1 addition & 4 deletions

@@ -255,20 +255,17 @@ def temp_name():
 def _stac_api_reader(
         df_reader: DataFrameReader,
         uri: str,
-        filters: dict = None,
-        search_limit: Optional[int] = None) -> DataFrame:
+        filters: dict = None) -> DataFrame:
     """
     uri - STAC API uri
     filters - a STAC API Search filters dict (bbox, datetime, intersects, collections, items, limit, query, next)
-    search_limit - search results convenient limit method
     """
     import json
 
     return df_reader \
         .format("stac-api") \
         .option("uri", uri) \
         .option("search-filters", json.dumps(filters)) \
-        .option("search-limit", search_limit) \
         .load()
 
 def _geotiff_writer(

rf-notebook/src/main/notebooks/STAC API Example.ipynb

Lines changed: 1 addition & 1 deletion

@@ -75,7 +75,7 @@
 "# due to the collection size and query parameters\n",
 "# it makes sense to limit the amount of items retrieved from the STAC API\n",
 "uri = 'https://earth-search.aws.element84.com/v0'\n",
-"df = spark.read.stacapi(uri, {'collections': ['landsat-8-l1-c1']}, search_limit=100)"
+"df = spark.read.stacapi(uri, {'collections': ['landsat-8-l1-c1']}).limit(100)"
 ]
},
{
