Implement py raster read list of lists

vpipkt · vpipkt · commit 5f97322c052b · 2019-08-22T14:05:12.000-04:00
Signed-off-by: Jason T. Brown <jason@astraea.earth>
diff --git a/pyrasterframes/src/main/python/pyrasterframes/__init__.py b/pyrasterframes/src/main/python/pyrasterframes/__init__.py
@@ -162,6 +162,16 @@ def temp_name():
             path = None
             catalog = None
             options.update(dict(paths='\n'.join([str(i) for i in source])))  # pass in "uri1\nuri2\nuri3\n..."
+        if all([isinstance(i, list) for i in source]):
+            # list of lists; we will rely on pandas to
+            #   - coerce all data to str (possibly using objects' __str__ or __repr__)
+            #   - ensure data is not "ragged": all sublists are same len
+            path = None
+            catalog_col_names = ['proj_raster_{}'.format(i) for i in range(len(source[0]))]
+            catalog = PdDataFrame(source,
+                                  columns=catalog_col_names,
+                                  dtype=str,
+                                  )
     elif isinstance(source, str):
         if '\n' in source or '\r' in source:
             # then the `source` string is a catalog as a CSV (header is required)
@@ -172,12 +182,13 @@ def temp_name():
             path = source
             catalog = None
     else:
-        # user has passed in some other type, we will interpret as a catalog
+        # user has passed in some other type, we will try to interpret as a catalog
         catalog = source
 
     if catalog is not None:
         if catalog_col_names is None:
             raise Exception("'catalog_col_names' required when DataFrame 'catalog' specified")
+
         if isinstance(catalog, str):
             options.update({
                 "catalogCSV": catalog,
diff --git a/pyrasterframes/src/main/python/tests/RasterSourceTest.py b/pyrasterframes/src/main/python/tests/RasterSourceTest.py
@@ -105,9 +105,19 @@ def l8path(b):
         print(path_count.collect())
         self.assertTrue(path_count.count() == 3)
 
-    @skip('not implemented yet')
     def test_list_of_list_of_str(self):
-        0
+        lol = [
+            [self.path(1, 1), self.path(1, 2), ],
+            [self.path(2, 1), self.path(2, 2), ],
+            [self.path(3, 1), self.path(3, 2), ]
+        ]
+        df = self.spark.read.raster(lol)
+        self.assertTrue(len(df.columns) == 4)  # 2 cols of uris plus 2 cols of proj_rasters
+        self.assertEqual(sorted(df.columns), sorted(['proj_raster_0_path', 'proj_raster_1_path',
+                                                     'proj_raster_0', 'proj_raster_1']))
+        uri_df = df.select('proj_raster_0_path', 'proj_raster_1_path').distinct().collect()
+        uri_list = [list(r.asDict().values()) for r in uri_df]
+        self.assertEqual(sorted(uri_list), sorted(lol))
 
     def test_schemeless_string(self):
         import os.path
@@ -186,4 +196,4 @@ def test_csv_string(self):
 
         df = self.spark.read.raster(s, ['b1', 'b2'])
         self.assertEqual(len(df.columns), 3 + 2)  # number of columns in original DF plus cardinality of catalog_col_names
-        self.assertTrue(len(df.take(1)))
+        self.assertTrue(len(df.take(1)))  # non-empty check