Skip to content

Commit 8da23d1

Browse files
committed
Merge remote-tracking branch 'lt/develop' into docs/fix-250-89
Signed-off-by: Jason T. Brown <[email protected]>
2 parents 48e6366 + 3878cf6 commit 8da23d1

File tree

10 files changed

+106
-47
lines changed

10 files changed

+106
-47
lines changed

.circleci/config.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -204,4 +204,4 @@ workflows:
204204
- it
205205
- itWithoutGdal
206206
- docs
207-
- staticAnalysis
207+
# - staticAnalysis

core/src/main/scala/org/locationtech/rasterframes/ref/RasterSource.scala

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -129,13 +129,18 @@ object RasterSource extends LazyLogging {
129129
} else false
130130

131131
/** Extractor for determining if a scheme indicates GDAL preference. */
132-
def unapply(source: URI): Boolean =
133-
gdalOnly(source) || ((preferGdal || source.getScheme.startsWith("gdal")) && GDALRasterSource.hasGDAL)
132+
def unapply(source: URI): Boolean = {
133+
lazy val schemeIsGdal = Option(source.getScheme())
134+
.exists(_.startsWith("gdal"))
135+
136+
gdalOnly(source) || ((preferGdal || schemeIsGdal) && GDALRasterSource.hasGDAL)
137+
}
134138
}
135139

136140
object IsDefaultGeoTiff {
137141
def unapply(source: URI): Boolean = source.getScheme match {
138-
case "file" | "http" | "https" | "s3" | "" => true
142+
case "file" | "http" | "https" | "s3" => true
143+
case null | "" => true
139144
case _ => false
140145
}
141146
}

core/src/test/scala/org/locationtech/rasterframes/ref/RasterSourceSpec.scala

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,13 @@ class RasterSourceSpec extends TestEnvironment with TestData {
106106
val src = RasterSource(localSrc)
107107
assert(!src.extent.isEmpty)
108108
}
109+
it("should interpret no scheme as file://"){
110+
val localSrc = geotiffDir.resolve("LC08_B7_Memphis_COG.tiff").toString()
111+
val schemelessUri = new URI(localSrc)
112+
schemelessUri.getScheme should be (null)
113+
val src = RasterSource(schemelessUri)
114+
assert(!src.extent.isEmpty)
115+
}
109116
}
110117

111118
if(GDALRasterSource.hasGDAL) {
@@ -132,6 +139,15 @@ class RasterSourceSpec extends TestEnvironment with TestData {
132139

133140
gdal.bandCount should be (3)
134141
}
142+
143+
it("should interpret no scheme as file://") {
144+
val localSrc = geotiffDir.resolve("LC08_B7_Memphis_COG.tiff").toString()
145+
val schemelessUri = new URI(localSrc)
146+
val gdal = GDALRasterSource(schemelessUri)
147+
val jvm = JVMGeoTiffRasterSource(schemelessUri)
148+
gdal.extent should be (jvm.extent)
149+
gdal.cellSize should be(jvm.cellSize)
150+
}
135151
}
136152
}
137153

pyrasterframes/src/main/python/docs/concepts.md

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,10 @@ A "NoData" (or N/A) value is a specifically identified value for a cell type use
4545

4646
A scene (or granule) is a discrete instance of EO @ref:[raster data](concepts.md#raster) with a specific extent (region), date-time, and map projection (or CRS).
4747

48+
## Band
49+
50+
A @ref:[scene](concepts.md#scene) frequently defines many different measurements captured at the same date-time, over the same extent, and meant to be processed together. These different measurements are referred to as bands. The name comes from the varying bandwidths of light and electromagnetic radiation measured in many EO datasets.
51+
4852
## Coordinate Reference System (CRS)
4953

5054
A [coordinate reference system (or spatial reference system)][CRS] is a set of mathematical constructs used to translate locations on the three-dimensional surface of the earth to the two dimensional raster grid. A CRS typically accompanies any EO data so it can be precisely located.
@@ -57,10 +61,10 @@ An extent (or bounding box) is a rectangular region specifying the geospatial co
5761

5862
A tile (sometimes called a "chip") is a rectangular subset of a @ref:[scene](concepts.md#scene). As a scene is a raster, a tile is also a raster. A tile can conceptually be thought of as a two-dimensional array.
5963

60-
Some EO data has many bands or channels. Tiles in this context are conceptually a three-dimensional array, with the extra dimension representing the bands.
64+
Some EO data has many @ref:[bands](concepts.md#band) or channels. Within RasterFrames, this third dimension is handled across columns of the DataFrame, such that the tiles within DataFrames are all two-dimensional arrays.
6165

6266
Tiles are often square and the dimensions are some power of two, for example 256 by 256.
6367

64-
The tile is the primary discretization unit used in RasterFrames. Each band of a scene is in a separate column. The scene's overall @ref:[extent](concepts.md#extent) is carved up into smaller extents for each tile. Each row of the DataFrame contains a two-dimensional tile per band column.
68+
The tile is the primary discretization unit used in RasterFrames. The scene's overall @ref:[extent](concepts.md#extent) is carved up into smaller extents and spread across rows.
6569

66-
[CRS]: https://en.wikipedia.org/wiki/Spatial_reference_system
70+
[CRS]: https://en.wikipedia.org/wiki/Spatial_reference_system

pyrasterframes/src/main/python/docs/raster-catalogs.pymd

Lines changed: 37 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,15 @@
11
# Raster Catalogs
22

3-
While much interesting processing can be done on a @ref:[single raster file](raster-read.md#single-raster), RasterFrames shines when _Catalogs_ of raster data are to be processed. In its simplest form, a _Catalog_ is a listing of @ref:[URLs referencing raster files](raster-read.md#uri-formats). This listing can be manifested as a DataFrame, CSV file or CSV string. The _Catalog_ is input into the `raster` DataSource, described in the next page.
3+
While much interesting processing can be done on a @ref:[single raster file](raster-read.md#single-raster), RasterFrames shines when _catalogs_ of raster data are to be processed. In its simplest form, a _catalog_ is a list of @ref:[URLs referencing raster files](raster-read.md#uri-formats). This list can be a Spark DataFrame, Pandas DataFrame, CSV file or CSV string. The _catalog_ is input into the `raster` DataSource, described in the @ref:[next page](raster-read.md), which creates tiles from the rasters at the referenced URLs.
44

5-
A _Catalog_ can have one or two dimensions:
5+
A _catalog_ can have one or two dimensions:
66

7-
* One-D: A single column containing one or many URLs across the rows. All referenced rasters represent the same content type. For example, a column of URLs to Landsat 8 NIR rasters covering Europe. Each row represents different places and times.
8-
* Two-D: Many columns containing raster URLs. Each column references the same content type, and each row represents the same place and time. For example, red-, green-, and blue-band columns for scenes covering Europe. Each row represents a single spatiotemporal location (or scene) with the same dimensions, extent, [_CRS_][CRS], etc across the row.
7+
* One-D: A single column contains raster URLs across the rows. All referenced rasters represent the same @ref:[band](concepts.md#band). For example, a column of URLs to Landsat 8 near-infrared rasters covering Europe. Each row represents different places and times.
8+
* Two-D: Many columns containing raster URLs. Each column references the same band, and each row represents the same place and time. For example, red-, green-, and blue-band columns for scenes covering Europe. Each row represents a single @ref:[scene](concepts.md#scene) with the same resolution, extent, [_CRS_][CRS], etc across the row.
99

1010
## Creating a Catalog
1111

12-
This section will provide some examples of creating your own _Catalogs_, as well as introduce some experimental _Catalogs_ built into RasterFrames. Reading raster data represented by a _Catalog_ is covered in more detail in the @ref:[next page](raster-read.md).
12+
This section will provide some examples of creating your own _catalogs_, as well as introduce some experimental _catalogs_ built into RasterFrames. Reading raster data represented by a _catalog_ is covered in more detail in the @ref:[next page](raster-read.md).
1313

1414
```python, echo=False
1515
from IPython.display import display
@@ -25,30 +25,37 @@ A single URL is the simplest form of a catalog.
2525

2626
```python
2727
from pyspark.sql import Row
28-
my_cat = "https://modis-pds.s3.amazonaws.com/MCD43A4.006/04/09/2018185/MCD43A4.A2018185.h04v09.006.2018194032851_B01.TIF"
29-
# or
30-
my_cat_df = spark.createDataFrame([Row(B01=my_cat)])
28+
29+
file_uri = "/data/raster/myfile.tif"
30+
# Pandas DF
31+
my_cat = pd.DataFrame({'B01': [file_uri]})
32+
33+
# equivalent Spark DF
34+
my_cat = spark.createDataFrame([Row(B01=file_uri)])
35+
36+
# equivalent CSV string
37+
my_cat = "B01\n{}".format(file_uri)
3138
```
3239

33-
A single column represents the same content type with different observations along the rows. In this example it is band 1 of MODIS, which is visible red. In the example the location of the images is the same (h04v09) but the dates differ.
40+
A single column represents the same content type with different observations along the rows. In this example it is band 1 of MODIS surface reflectance, which is visible red. In the example the location of the images is the same, indicated by the granule identifier `h04v09`, but the dates differ: 2018185 (July 4, 2018) and 2018188 (July 7, 2018).
3441

3542
```python
3643
scene1_B01 = "https://modis-pds.s3.amazonaws.com/MCD43A4.006/04/09/2018185/MCD43A4.A2018185.h04v09.006.2018194032851_B01.TIF"
3744
scene2_B01 = "https://modis-pds.s3.amazonaws.com/MCD43A4.006/04/09/2018188/MCD43A4.A2018188.h04v09.006.2018198232008_B01.TIF"
3845

39-
# As CSV string
40-
my_cat = '\n'.join(['B01', scene1_B01, scene2_B01])
41-
# or
42-
my_cat_df = spark.createDataFrame([Row(B01=scene1_B01), Row(B01=scene2_B01)])
43-
my_cat_df.printSchema()
44-
# or
45-
pandas_cat = pd.DataFrame([{'B01': scene1_B01}, {'B01': scene2_B01}])
46-
my_cat = spark.createDataFrame(pandas_cat)
46+
# a pandas DF
47+
one_d_cat = pd.DataFrame({'B01': [scene1_B01, scene2_B01]})
48+
49+
# equivalent spark DF
50+
one_d_cat = spark.createDataFrame([Row(B01=scene1_B01), Row(B01=scene2_B01)])
51+
52+
# equivalent CSV string
53+
one_d_cat = '\n'.join(['B01', scene1_B01, scene2_B01])
4754
```
4855

4956
### Two-D
5057

51-
Example of a multiple columns representing multiple content types (bands) across multiple scenes. In each row, the scene is the same. The first column is band 1 and the second is band 2, near infrared.
58+
Example of multiple columns representing multiple content types (bands) across multiple scenes. In each row, the scene is the same: granule id `h04v09` on July 4 or July 7, 2018. The first column is band 1, red, and the second is band 2, near infrared.
5259

5360
```python
5461
scene1_B01 = "https://modis-pds.s3.amazonaws.com/MCD43A4.006/04/09/2018185/MCD43A4.A2018185.h04v09.006.2018194032851_B01.TIF"
@@ -68,36 +75,37 @@ my_cat_df.printSchema()
6875

6976
## Using External Catalogs
7077

71-
The simplest example of an external _Catalog_ is a DataFrame (or a transformation of) in one of the formats above. Here's an extended example of reading an external CSV file of MODIS scenes and transforming it into a _Catalog_. The metadata describing the content of each URL is an important aspect of processing raster data. This example includes some minimal metadata.
78+
The concept of a _catalog_ is much more powerful when we consider examples beyond constructing the DataFrame, and instead read the data from an external source. Here's an extended example of reading a cloud-hosted CSV file containing MODIS scene metadata and transforming it into a _catalog_. The metadata describing the content of each URL is an important aspect of processing raster data.
7279

7380
```python
7481
from pyspark import SparkFiles
7582
from pyspark.sql import functions as F
7683

7784
spark.sparkContext.addFile("https://modis-pds.s3.amazonaws.com/MCD43A4.006/2018-07-04_scenes.txt")
7885

79-
# The scenes list file has index URIs in the download_url column, for example:
80-
# https://modis-pds.s3.amazonaws.com/MCD43A4.006/04/09/2018185/index.html
81-
# Image URIs take the form:
82-
# https://modis-pds.s3.amazonaws.com/MCD43A4.006/04/09/2018185/MCD43A4.A2018185.h04v09.006.2018194032851_B01.TIF
83-
84-
modis_catalog = spark.read \
86+
scene_list = spark.read \
8587
.format("csv") \
8688
.option("header", "true") \
87-
.load(SparkFiles.get("2018-07-04_scenes.txt")) \
89+
.load(SparkFiles.get("2018-07-04_scenes.txt"))
90+
scene_list.head(3)
91+
```
92+
93+
Observe the scenes list file has URIs to `index.html` files in the download_url column. The image URIs are in the same directory. The filenames are of the form `${gid}_B${band}.TIF`. The next code chunk builds these URIs, which completes our catalog.
94+
95+
```python
96+
modis_catalog = scene_list \
8897
.withColumn('base_url',
8998
F.concat(F.regexp_replace('download_url', 'index.html$', ''), 'gid',)
9099
) \
91100
.withColumn('B01' , F.concat('base_url', F.lit("_B01.TIF"))) \
92101
.withColumn('B02' , F.concat('base_url', F.lit("_B02.TIF"))) \
93102
.withColumn('B03' , F.concat('base_url', F.lit("_B03.TIF")))
94-
# ... and so on.
95-
modis_catalog.printSchema()
103+
modis_catalog.head(3)
96104
```
97105

98106
## Using Built-in Experimental Catalogs
99107

100-
RasterFrames comes with two experimental catalogs over the AWS PDS [Landsat 8][Landsat] and [MODIS][MODIS] repositories. They are created by downloading the latest scene lists and transforming as in the prior example.
108+
RasterFrames comes with two experimental catalogs over the AWS PDS [Landsat 8][Landsat] and [MODIS][MODIS] repositories. They are created by downloading the latest scene lists and building up the appropriate band URI columns as in the prior example.
101109

102110
> Note: The first time you run these may take some time, as the catalogs are large. However, they are cached and subsequent invocations should be faster.
103111

pyrasterframes/src/main/python/docs/raster-read.pymd

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -148,11 +148,11 @@ In the initial examples on this page, you may have noticed that the realized (no
148148

149149
## Multiband Rasters
150150

151-
A multiband raster represents a three dimensional numeric array. The first two dimensions are spatial, and the third dimension is typically designated as different bands. The bands could represent intensity of different wavelengths of light (or other electromagnetic radiation), or they could represent other phenomena such as measurement time, quality indications, or additional measurements.
151+
A multiband raster represents a three dimensional numeric array. The first two dimensions are spatial, and the third dimension is typically designated as different @ref:[bands](concepts.md#band). The bands could represent intensity of different wavelengths of light (or other electromagnetic radiation), or they could represent other phenomena such as measurement time, quality indications, or additional measurements.
152152

153153
Multiband raster files have a strictly ordered set of bands, which are typically indexed from 1. Some files have metadata tags associated with each band. Some files have a color interpretation metadata tag indicating how to interpret the bands.
154154

155-
When reading a multiband raster or a _Catalog_ describing multiband rasters, you will need to know ahead of time which bands you want to read. You will specify the bands to read, **indexed from zero**, passing a list of integers into the `band_indexes` parameter of the `raster` reader.
155+
When reading a multiband raster or a _catalog_ describing multiband rasters, you will need to know ahead of time which bands you want to read. You will specify the bands to read, **indexed from zero**, as a list of integers into the `band_indexes` parameter of the `raster` reader.
156156

157157
For example, we can read a four-band (red, green, blue, and near-infrared) image as follows. The individual rows of the resulting DataFrame still represent distinct spatial extents, with a projected raster column for each band specified by `band_indexes`.
158158

pyrasterframes/src/main/python/docs/raster-write.pymd

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ Fortunately, we can use the cluster computing capability to downsample the data
6666

6767
```python write_geotiff
6868
outfile = os.path.join('/tmp', 'geotiff-overview.tif')
69-
spark_df.write.geotiff('file://' + outfile, crs='EPSG:4326', raster_dimensions=(256, 256))
69+
spark_df.write.geotiff(outfile, crs='EPSG:4326', raster_dimensions=(256, 256))
7070
```
7171

7272
View it with `rasterio` to check the results:
@@ -83,6 +83,10 @@ with rasterio.open(outfile) as src:
8383

8484
If there are many tile or projected raster columns in the DataFrame, the GeoTIFF writer will write each one as a separate band in the file. Each band in the output will be tagged with the input column names for reference.
8585

86+
```python, echo=False
87+
os.remove(outfile)
88+
```
89+
8690
## GeoTrellis Layers
8791

8892
[GeoTrellis][GeoTrellis] is one of the key libraries that RasterFrames builds upon. It provides a Scala language API to working with large raster data with Apache Spark. Ingesting raster data into a Layer is one of the key concepts for creating a dataset for processing on Spark. RasterFrames writes data from an appropriate DataFrame into a [GeoTrellis Layer](https://geotrellis.readthedocs.io/en/latest/guide/tile-backends.html). RasterFrames provides a `geotrellis` DataSource that supports both @ref:[reading](raster-read.md#geotrellis) and writing of GeoTrellis layers.

pyrasterframes/src/main/python/tests/GeoTiffWriterTests.py

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,11 +33,14 @@ def _tmpfile():
3333

3434
def test_identity_write(self):
3535
rf = self.spark.read.geotiff(self.img_uri)
36+
rf_count = rf.count()
37+
self.assertTrue(rf_count > 0)
3638

3739
dest = self._tmpfile()
3840
rf.write.geotiff(dest)
3941

40-
rf2 = self.spark.read.geotiff('file://' + dest)
42+
rf2 = self.spark.read.geotiff(dest)
43+
4144
self.assertEqual(rf2.count(), rf.count())
4245

4346
os.remove(dest)
@@ -47,7 +50,7 @@ def test_unstructured_write(self):
4750
dest_file = self._tmpfile()
4851
rf.write.geotiff(dest_file, crs='EPSG:32616')
4952

50-
rf2 = self.spark.read.raster('file://' + dest_file)
53+
rf2 = self.spark.read.raster(dest_file)
5154
self.assertEqual(rf2.count(), rf.count())
5255

5356
with rasterio.open(self.img_uri) as source:
@@ -58,6 +61,22 @@ def test_unstructured_write(self):
5861

5962
os.remove(dest_file)
6063

64+
def test_unstructured_write_schemeless(self):
65+
# should be able to write a projected raster tile column to path like '/data/foo/file.tif'
66+
from pyrasterframes.rasterfunctions import rf_agg_stats, rf_crs
67+
rf = self.spark.read.raster(self.img_uri)
68+
max = rf.agg(rf_agg_stats('proj_raster').max.alias('max')).first()['max']
69+
crs = rf.select(rf_crs('proj_raster').crsProj4.alias('c')).first()['c']
70+
71+
dest_file = self._tmpfile()
72+
self.assertTrue(not dest_file.startswith('file://'))
73+
rf.write.geotiff(dest_file, crs=crs)
74+
75+
with rasterio.open(dest_file) as src:
76+
self.assertEqual(src.read().max(), max)
77+
78+
os.remove(dest_file)
79+
6180
def test_downsampled_write(self):
6281
rf = self.spark.read.raster(self.img_uri)
6382
dest = self._tmpfile()

pyrasterframes/src/main/python/tests/PyRasterFramesTests.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -470,6 +470,13 @@ def l8path(b):
470470
print(path_count.toPandas())
471471
self.assertTrue(path_count.count() == 3)
472472

473+
def test_raster_source_reader_schemeless(self):
474+
import os.path
475+
path = os.path.join(self.resource_dir, "L8-B8-Robinson-IL.tiff")
476+
self.assertTrue(not path.startswith('file://'))
477+
df = self.spark.read.raster(path)
478+
self.assertTrue(df.count() > 0)
479+
473480
def test_raster_source_catalog_reader(self):
474481
import pandas as pd
475482

rf-notebook/src/main/docker/Dockerfile

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,8 @@ FROM s22s/pyspark-notebook:spark-2.3.3-hadoop-2.7
22

33
MAINTAINER Astraea, Inc.
44

5-
ENV RF_LIB_LOC /usr/local/rasterframes
5+
ENV RF_LIB_LOC=/usr/local/rasterframes \
6+
LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/conda/lib"
67

78
USER root
89

@@ -14,11 +15,6 @@ EXPOSE 4040 4041 4042 4043 4044
1415
RUN conda install --quiet --yes \
1516
anaconda sphinx nbsphinx shapely numpy folium geopandas geojsonio
1617

17-
RUN apt-get update && \
18-
apt-get install -y gdal-bin && \
19-
apt autoremove && \
20-
apt-get clean all
21-
2218
# Cleanup pip residuals
2319
RUN rm -rf /home/$NB_USER/.local && \
2420
fix-permissions /home/$NB_USER

0 commit comments

Comments
 (0)