googleapis
diff --git a/bigframes/core/compile/sqlglot/compiler.py b/bigframes/core/compile/sqlglot/compiler.py
Lines changed: 3 additions & 8 deletions
diff --git a/bigframes/core/compile/sqlglot/sqlglot_ir.py b/bigframes/core/compile/sqlglot/sqlglot_ir.py
Lines changed: 49 additions & 19 deletions
diff --git a/tests/data/scalars.jsonl b/tests/data/scalars.jsonl
Lines changed: 9 additions & 9 deletions
diff --git a/tests/unit/core/compile/sqlglot/conftest.py b/tests/unit/core/compile/sqlglot/conftest.py
Lines changed: 82 additions & 10 deletions
@@ -146,22 +146,17 @@ def _compile_node(
 
 @_compile_node.register
 def compile_readlocal(node: nodes.ReadLocalNode, *args) -> ir.SQLGlotIR:
-    offsets = node.offsets_col.sql if node.offsets_col else None
-    schema_names = node.schema.names
-    schema_dtypes = node.schema.dtypes
-
     pa_table = node.local_data_source.data
     pa_table = pa_table.select([item.source_id for item in node.scan_list.items])
-    pa_table = pa_table.rename_columns(
-        {item.source_id: item.id.sql for item in node.scan_list.items}
-    )
+    pa_table = pa_table.rename_columns([item.id.sql for item in node.scan_list.items])
 
+    offsets = node.offsets_col.sql if node.offsets_col else None
     if offsets:
         pa_table = pa_table.append_column(
             offsets, pa.array(range(pa_table.num_rows), type=pa.int64())
         )
 
-    return ir.SQLGlotIR.from_pandas(pa_table.to_pandas(), schema_names, schema_dtypes)
+    return ir.SQLGlotIR.from_pyarrow(pa_table, node.schema)
 
 
 @_compile_node.register
 
@@ -17,13 +17,23 @@
 import dataclasses
 import typing
 
-import pandas as pd
+import pyarrow as pa
 import sqlglot as sg
 import sqlglot.dialects.bigquery
 import sqlglot.expressions as sge
 
 from bigframes import dtypes
 import bigframes.core.compile.sqlglot.sqlglot_types as sgt
+import bigframes.core.local_data as local_data
+import bigframes.core.schema as schemata
+
+# shapely.wkt.dumps was moved to shapely.io.to_wkt in 2.0.
+try:
+    from shapely.io import to_wkt  # type: ignore
+except ImportError:
+    from shapely.wkt import dumps  # type: ignore
+
+    to_wkt = dumps
 
 
 @dataclasses.dataclass(frozen=True)
@@ -48,35 +58,32 @@ def sql(self) -> str:
         return self.expr.sql(dialect=self.dialect, pretty=self.pretty)
 
     @classmethod
-    def from_pandas(
-        cls,
-        pd_df: pd.DataFrame,
-        schema_names: typing.Sequence[str],
-        schema_dtypes: typing.Sequence[dtypes.Dtype],
+    def from_pyarrow(
+        cls, pa_table: pa.Table, schema: schemata.ArraySchema
     ) -> SQLGlotIR:
         """Builds SQLGlot expression from pyarrow table."""
         dtype_expr = sge.DataType(
             this=sge.DataType.Type.STRUCT,
             expressions=[
                 sge.ColumnDef(
-                    this=sge.to_identifier(name, quoted=True),
-                    kind=sgt.SQLGlotType.from_bigframes_dtype(dtype),
+                    this=sge.to_identifier(field.column, quoted=True),
+                    kind=sgt.SQLGlotType.from_bigframes_dtype(field.dtype),
                 )
-                for name, dtype in zip(schema_names, schema_dtypes)
+                for field in schema.items
             ],
             nested=True,
         )
         data_expr = [
-            sge.Tuple(
+            sge.Struct(
                 expressions=tuple(
                     _literal(
                         value=value,
-                        dtype=sgt.SQLGlotType.from_bigframes_dtype(dtype),
+                        dtype=field.dtype,
                     )
-                    for value, dtype in zip(row, schema_dtypes)
+                    for value, field in zip(tuple(row_dict.values()), schema.items)
                 )
             )
-            for _, row in pd_df.iterrows()
+            for row_dict in local_data._iter_table(pa_table, schema)
         ]
         expr = sge.Unnest(
             expressions=[
@@ -105,13 +112,36 @@ def select(
         return SQLGlotIR(expr=expr)
 
 
-def _literal(value: typing.Any, dtype: str) -> sge.Expression:
+def _literal(value: typing.Any, dtype: dtypes.Dtype) -> sge.Expression:
+    sqlglot_type = sgt.SQLGlotType.from_bigframes_dtype(dtype)
     if value is None:
-        return _cast(sge.Null(), dtype)
-
-    # TODO: handle other types like visit_DefaultLiteral
-    return sge.convert(value)
+        return _cast(sge.Null(), sqlglot_type)
+    elif dtype == dtypes.BYTES_DTYPE:
+        return _cast(str(value), sqlglot_type)
+    elif dtypes.is_time_like(dtype):
+        return _cast(sge.convert(value.isoformat()), sqlglot_type)
+    elif dtypes.is_geo_like(dtype):
+        wkt = value if isinstance(value, str) else to_wkt(value)
+        return sge.func("ST_GEOGFROMTEXT", sge.convert(wkt))
+    elif dtype == dtypes.JSON_DTYPE:
+        return sge.ParseJSON(this=sge.convert(str(value)))
+    elif dtypes.is_struct_like(dtype):
+        items = [
+            _literal(value=value[field_name], dtype=field_dtype).as_(
+                field_name, quoted=True
+            )
+            for field_name, field_dtype in dtypes.get_struct_fields(dtype).items()
+        ]
+        return sge.Struct.from_arg_list(items)
+    elif dtypes.is_array_like(dtype):
+        value_type = dtypes.get_array_inner_type(dtype)
+        values = sge.Array(
+            expressions=[_literal(value=v, dtype=value_type) for v in value]
+        )
+        return values if len(value) > 0 else _cast(values, sqlglot_type)
+    else:
+        return sge.convert(value)
 
 
-def _cast(arg, to) -> sge.Cast:
+def _cast(arg: typing.Any, to: str) -> sge.Cast:
     return sge.Cast(this=arg, to=to)
@@ -1,9 +1,9 @@
-{"bool_col": true,  "bytes_col": "SGVsbG8sIFdvcmxkIQ==",     "date_col": "2021-07-21", "datetime_col": "2021-07-21 11:39:45", "geography_col": "POINT(-122.0838511 37.3860517)",             "int64_col": "123456789",  "int64_too": "0",     "numeric_col": "1.23456789",  "float64_col": "1.25",     "rowindex": 0, "rowindex_2": 0, "string_col": "Hello, World!",     "time_col": "11:41:43.076160", "timestamp_col": "2021-07-21T17:43:43.945289Z"}
-{"bool_col": false, "bytes_col": "44GT44KT44Gr44Gh44Gv",     "date_col": "1991-02-03", "datetime_col": "1991-01-02 03:45:06", "geography_col": "POINT(-71.104 42.315)",                      "int64_col": "-987654321", "int64_too": "1",     "numeric_col": "1.23456789",  "float64_col": "2.51",     "rowindex": 1, "rowindex_2": 1, "string_col": "こんにちは",          "time_col": "11:14:34.701606", "timestamp_col": "2021-07-21T17:43:43.945289Z"}
-{"bool_col": true,  "bytes_col": "wqFIb2xhIE11bmRvIQ==",     "date_col": "2023-03-01", "datetime_col": "2023-03-01 10:55:13", "geography_col": "POINT(-0.124474760143016 51.5007826749545)", "int64_col": "314159",     "int64_too": "0",     "numeric_col": "101.1010101", "float64_col": "2.5e10",   "rowindex": 2, "rowindex_2": 2, "string_col": "  ¡Hola Mundo!  ",  "time_col": "23:59:59.999999", "timestamp_col": "2023-03-01T10:55:13.250125Z"}
-{"bool_col": null,  "bytes_col": null,                       "date_col": null,         "datetime_col": null,                  "geography_col": null,                                         "int64_col": null,         "int64_too": "1",     "numeric_col": null,          "float64_col": null,       "rowindex": 3, "rowindex_2": 3, "string_col": null,                "time_col": null,              "timestamp_col": null}
-{"bool_col": false, "bytes_col": "44GT44KT44Gr44Gh44Gv",     "date_col": "2021-07-21", "datetime_col": null,                  "geography_col": null,                                         "int64_col": "-234892",    "int64_too": "-2345", "numeric_col": null,          "float64_col": null,       "rowindex": 4, "rowindex_2": 4, "string_col": "Hello, World!",     "time_col": null,              "timestamp_col": null}
-{"bool_col": false, "bytes_col": "R8O8dGVuIFRhZw==",         "date_col": "1980-03-14", "datetime_col": "1980-03-14 15:16:17", "geography_col": null,                                         "int64_col": "55555",      "int64_too": "0",     "numeric_col": "5.555555",    "float64_col": "555.555",  "rowindex": 5, "rowindex_2": 5, "string_col": "Güten Tag!",        "time_col": "15:16:17.181921", "timestamp_col": "1980-03-14T15:16:17.181921Z"}
-{"bool_col": true,  "bytes_col": "SGVsbG8JQmlnRnJhbWVzIQc=", "date_col": "2023-05-23", "datetime_col": "2023-05-23 11:37:01", "geography_col": "MULTIPOINT (20 20, 10 40, 40 30, 30 10)",    "int64_col": "101202303",  "int64_too": "2",     "numeric_col": "-10.090807",  "float64_col": "-123.456", "rowindex": 6, "rowindex_2": 6, "string_col": "capitalize, This ", "time_col": "01:02:03.456789", "timestamp_col": "2023-05-23T11:42:55.000001Z"}
-{"bool_col": true,  "bytes_col": null,                       "date_col": "2038-01-20", "datetime_col": "2038-01-19 03:14:08", "geography_col": null,                                         "int64_col": "-214748367", "int64_too": "2",     "numeric_col": "11111111.1",  "float64_col": "42.42",    "rowindex": 7, "rowindex_2": 7, "string_col": " سلام",               "time_col": "12:00:00.000001", "timestamp_col": "2038-01-19T03:14:17.999999Z"}
-{"bool_col": false, "bytes_col": null,                       "date_col": null,         "datetime_col": null,                  "geography_col": null,                                         "int64_col": "2",          "int64_too": "1",     "numeric_col": null,          "float64_col": "6.87",     "rowindex": 8, "rowindex_2": 8, "string_col": "T",                 "time_col": null,              "timestamp_col": null}
+{"bool_col": true,  "bytes_col": "SGVsbG8sIFdvcmxkIQ==",     "date_col": "2021-07-21", "datetime_col": "2021-07-21 11:39:45", "geography_col": "POINT(-122.0838511 37.3860517)",                       "int64_col": "123456789",  "int64_too": "0",     "numeric_col": "1.23456789",  "float64_col": "1.25",     "rowindex": 0, "rowindex_2": 0, "string_col": "Hello, World!",     "time_col": "11:41:43.076160", "timestamp_col": "2021-07-21T17:43:43.945289Z"}
+{"bool_col": false, "bytes_col": "44GT44KT44Gr44Gh44Gv",     "date_col": "1991-02-03", "datetime_col": "1991-01-02 03:45:06", "geography_col": "POINT(-71.104 42.315)",                                "int64_col": "-987654321", "int64_too": "1",     "numeric_col": "1.23456789",  "float64_col": "2.51",     "rowindex": 1, "rowindex_2": 1, "string_col": "こんにちは",          "time_col": "11:14:34.701606", "timestamp_col": "2021-07-21T17:43:43.945289Z"}
+{"bool_col": true,  "bytes_col": "wqFIb2xhIE11bmRvIQ==",     "date_col": "2023-03-01", "datetime_col": "2023-03-01 10:55:13", "geography_col": "POINT(-0.124474760143016 51.5007826749545)",           "int64_col": "314159",     "int64_too": "0",     "numeric_col": "101.1010101", "float64_col": "2.5e10",   "rowindex": 2, "rowindex_2": 2, "string_col": "  ¡Hola Mundo!  ",  "time_col": "23:59:59.999999", "timestamp_col": "2023-03-01T10:55:13.250125Z"}
+{"bool_col": null,  "bytes_col": null,                       "date_col": null,         "datetime_col": null,                  "geography_col": null,                                                   "int64_col": null,         "int64_too": "1",     "numeric_col": null,          "float64_col": null,       "rowindex": 3, "rowindex_2": 3, "string_col": null,                "time_col": null,              "timestamp_col": null}
+{"bool_col": false, "bytes_col": "44GT44KT44Gr44Gh44Gv",     "date_col": "2021-07-21", "datetime_col": null,                  "geography_col": null,                                                   "int64_col": "-234892",    "int64_too": "-2345", "numeric_col": null,          "float64_col": null,       "rowindex": 4, "rowindex_2": 4, "string_col": "Hello, World!",     "time_col": null,              "timestamp_col": null}
+{"bool_col": false, "bytes_col": "R8O8dGVuIFRhZw==",         "date_col": "1980-03-14", "datetime_col": "1980-03-14 15:16:17", "geography_col": null,                                                   "int64_col": "55555",      "int64_too": "0",     "numeric_col": "5.555555",    "float64_col": "555.555",  "rowindex": 5, "rowindex_2": 5, "string_col": "Güten Tag!",        "time_col": "15:16:17.181921", "timestamp_col": "1980-03-14T15:16:17.181921Z"}
+{"bool_col": true,  "bytes_col": "SGVsbG8JQmlnRnJhbWVzIQc=", "date_col": "2023-05-23", "datetime_col": "2023-05-23 11:37:01", "geography_col": "LINESTRING(-0.127959 51.507728, -0.127026 51.507473)", "int64_col": "101202303",  "int64_too": "2",     "numeric_col": "-10.090807",  "float64_col": "-123.456", "rowindex": 6, "rowindex_2": 6, "string_col": "capitalize, This ", "time_col": "01:02:03.456789", "timestamp_col": "2023-05-23T11:42:55.000001Z"}
+{"bool_col": true,  "bytes_col": null,                       "date_col": "2038-01-20", "datetime_col": "2038-01-19 03:14:08", "geography_col": null,                                                   "int64_col": "-214748367", "int64_too": "2",     "numeric_col": "11111111.1",  "float64_col": "42.42",    "rowindex": 7, "rowindex_2": 7, "string_col": " سلام",               "time_col": "12:00:00.000001", "timestamp_col": "2038-01-19T03:14:17.999999Z"}
+{"bool_col": false, "bytes_col": null,                       "date_col": null,         "datetime_col": null,                  "geography_col": null,                                                   "int64_col": "2",          "int64_too": "1",     "numeric_col": null,          "float64_col": "6.87",     "rowindex": 8, "rowindex_2": 8, "string_col": "T",                 "time_col": null,              "timestamp_col": null}
@@ -12,29 +12,101 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import pathlib
+
 import pandas as pd
+import pyarrow as pa
 import pytest
 
+from bigframes import dtypes
+import tests.system.utils
+
+CURRENT_DIR = pathlib.Path(__file__).parent
+DATA_DIR = CURRENT_DIR.parent.parent.parent.parent / "data"
+
 
-@pytest.fixture(scope="module")
+@pytest.fixture(scope="session")
 def compiler_session():
     from . import compiler_session
 
     return compiler_session.SQLCompilerSession()
 
 
-@pytest.fixture(scope="module")
-def all_types_df() -> pd.DataFrame:
-    # TODO: all types pandas dataframes
+@pytest.fixture(scope="session")
+def scalars_types_pandas_df() -> pd.DataFrame:
+    """Returns a pandas DataFrame containing all scalar types and using the `rowindex`
+    column as the index."""
     # TODO: add tests for empty dataframes
+    df = pd.read_json(
+        DATA_DIR / "scalars.jsonl",
+        lines=True,
+    )
+    tests.system.utils.convert_pandas_dtypes(df, bytes_col=True)
+
+    df = df.set_index("rowindex", drop=False)
+    return df
+
+
+@pytest.fixture(scope="session")
+def nested_structs_pandas_df() -> pd.DataFrame:
+    """Returns a pandas DataFrame containing STRUCT types and using the `id`
+    column as the index."""
+
+    df = pd.read_json(
+        DATA_DIR / "nested_structs.jsonl",
+        lines=True,
+    )
+    df = df.set_index("id")
+
+    address_struct_schema = pa.struct(
+        [pa.field("city", pa.string()), pa.field("country", pa.string())]
+    )
+    person_struct_schema = pa.struct(
+        [
+            pa.field("name", pa.string()),
+            pa.field("age", pa.int64()),
+            pa.field("address", address_struct_schema),
+        ]
+    )
+    df["person"] = df["person"].astype(pd.ArrowDtype(person_struct_schema))
+    return df
+
+
+@pytest.fixture(scope="session")
+def repeated_pandas_df() -> pd.DataFrame:
+    """Returns a pandas DataFrame containing LIST types and using the `rowindex`
+    column as the index."""
+
+    df = pd.read_json(
+        DATA_DIR / "repeated.jsonl",
+        lines=True,
+    )
+    df = df.set_index("rowindex")
+    return df
+
+
+@pytest.fixture(scope="session")
+def json_pandas_df() -> pd.DataFrame:
+    """Returns a pandas DataFrame containing JSON types and using the `rowindex`
+    column as the index."""
+    json_data = [
+        "null",
+        "true",
+        "100",
+        "0.98",
+        '"a string"',
+        "[]",
+        "[1, 2, 3]",
+        '[{"a": 1}, {"a": 2}, {"a": null}, {}]',
+        '"100"',
+        '{"date": "2024-07-16"}',
+        '{"int_value": 2, "null_filed": null}',
+        '{"list_data": [10, 20, 30]}',
+    ]
     df = pd.DataFrame(
         {
-            "int1": pd.Series([1, 2, 3], dtype="Int64"),
-            "int2": pd.Series([-10, 20, 30], dtype="Int64"),
-            "bools": pd.Series([True, None, False], dtype="boolean"),
-            "strings": pd.Series(["b", "aa", "ccc"], dtype="string[pyarrow]"),
+            "json_col": pd.Series(json_data, dtype=dtypes.JSON_DTYPE),
         },
+        index=pd.Series(range(len(json_data)), dtype=dtypes.INT_DTYPE),
     )
-    # add more complexity index.
-    df.index = df.index.astype("Int64")
     return df