# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

"""IO read functions using the global context."""

# Postponed evaluation keeps the ``X | Y`` annotations below valid on
# Python versions older than 3.10.
from __future__ import annotations

import pathlib

import pyarrow

from datafusion.dataframe import DataFrame
from datafusion.expr import Expr

from ._internal import SessionContext as SessionContextInternal


def read_parquet(
    path: str | pathlib.Path,
    table_partition_cols: list[tuple[str, str]] | None = None,
    parquet_pruning: bool = True,
    file_extension: str = ".parquet",
    skip_metadata: bool = True,
    schema: pyarrow.Schema | None = None,
    file_sort_order: list[list[Expr]] | None = None,
) -> DataFrame:
    """Read a Parquet source into a :py:class:`~datafusion.dataframe.DataFrame`.

    This function will use the global context. Any functions or tables registered
    with another context may not be accessible when used with a DataFrame created
    using this function.

    Args:
        path: Path to the Parquet file.
        table_partition_cols: Partition columns.
        parquet_pruning: Whether the parquet reader should use the predicate
            to prune row groups.
        file_extension: File extension; only files with this extension are
            selected for data input.
        skip_metadata: Whether the parquet reader should skip any metadata
            that may be in the file schema. This can help avoid schema
            conflicts due to metadata.
        schema: An optional schema representing the parquet files. If None,
            the parquet reader will try to infer it based on data in the
            file.
        file_sort_order: Sort order for the file.

    Returns:
        DataFrame representation of the read Parquet files.
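
    Example:
        A minimal usage sketch; it assumes this module is importable as
        ``datafusion.io`` and that the illustrative Parquet file exists
        locally::

            from datafusion.io import read_parquet

            df = read_parquet("data.parquet")
            df.show()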
    """
    if table_partition_cols is None:
        table_partition_cols = []
    return DataFrame(
        SessionContextInternal._global_ctx().read_parquet(
            str(path),
            table_partition_cols,
            parquet_pruning,
            file_extension,
            skip_metadata,
            schema,
            file_sort_order,
        )
    )


def read_json(
    path: str | pathlib.Path,
    schema: pyarrow.Schema | None = None,
    schema_infer_max_records: int = 1000,
    file_extension: str = ".json",
    table_partition_cols: list[tuple[str, str]] | None = None,
    file_compression_type: str | None = None,
) -> DataFrame:
    """Read a line-delimited JSON data source.

    This function will use the global context. Any functions or tables registered
    with another context may not be accessible when used with a DataFrame created
    using this function.

    Args:
        path: Path to the JSON file.
        schema: The data source schema.
        schema_infer_max_records: Maximum number of rows to read from JSON
            files for schema inference if needed.
        file_extension: File extension; only files with this extension are
            selected for data input.
        table_partition_cols: Partition columns.
        file_compression_type: File compression type.

    Returns:
        DataFrame representation of the read JSON files.
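
    Example:
        A minimal usage sketch; it assumes this module is importable as
        ``datafusion.io`` and that the illustrative line-delimited JSON file
        exists locally::

            from datafusion.io import read_json

            df = read_json("data.json")
            df.show()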
    """
    if table_partition_cols is None:
        table_partition_cols = []
    return DataFrame(
        SessionContextInternal._global_ctx().read_json(
            str(path),
            schema,
            schema_infer_max_records,
            file_extension,
            table_partition_cols,
            file_compression_type,
        )
    )


def read_csv(
    path: str | pathlib.Path | list[str] | list[pathlib.Path],
    schema: pyarrow.Schema | None = None,
    has_header: bool = True,
    delimiter: str = ",",
    schema_infer_max_records: int = 1000,
    file_extension: str = ".csv",
    table_partition_cols: list[tuple[str, str]] | None = None,
    file_compression_type: str | None = None,
) -> DataFrame:
    """Read a CSV data source.

    This function will use the global context. Any functions or tables registered
    with another context may not be accessible when used with a DataFrame created
    using this function.

    Args:
        path: Path to the CSV file, or a list of such paths.
        schema: An optional schema representing the CSV files. If None, the
            CSV reader will try to infer it based on data in the file.
        has_header: Whether the CSV file has a header. If schema inference
            is run on a file with no headers, default column names are
            created.
        delimiter: An optional column delimiter.
        schema_infer_max_records: Maximum number of rows to read from CSV
            files for schema inference if needed.
        file_extension: File extension; only files with this extension are
            selected for data input.
        table_partition_cols: Partition columns.
        file_compression_type: File compression type.

    Returns:
        DataFrame representation of the read CSV files.
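
    Example:
        A minimal usage sketch; it assumes this module is importable as
        ``datafusion.io`` and that the illustrative CSV file with a header
        row exists locally::

            from datafusion.io import read_csv

            df = read_csv("data.csv", has_header=True)
            df.show()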
    """
    if table_partition_cols is None:
        table_partition_cols = []

    path = [str(p) for p in path] if isinstance(path, list) else str(path)

    return DataFrame(
        SessionContextInternal._global_ctx().read_csv(
            path,
            schema,
            has_header,
            delimiter,
            schema_infer_max_records,
            file_extension,
            table_partition_cols,
            file_compression_type,
        )
    )


def read_avro(
    path: str | pathlib.Path,
    schema: pyarrow.Schema | None = None,
    file_partition_cols: list[tuple[str, str]] | None = None,
    file_extension: str = ".avro",
) -> DataFrame:
    """Create a :py:class:`DataFrame` for reading an Avro data source.

    This function will use the global context. Any functions or tables registered
    with another context may not be accessible when used with a DataFrame created
    using this function.

    Args:
        path: Path to the Avro file.
        schema: The data source schema.
        file_partition_cols: Partition columns.
        file_extension: File extension to select.

    Returns:
        DataFrame representation of the read Avro file.
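
    Example:
        A minimal usage sketch; it assumes this module is importable as
        ``datafusion.io`` and that the illustrative Avro file exists
        locally::

            from datafusion.io import read_avro

            df = read_avro("data.avro")
            df.show()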
    """
    if file_partition_cols is None:
        file_partition_cols = []
    return DataFrame(
        SessionContextInternal._global_ctx().read_avro(
            str(path), schema, file_partition_cols, file_extension
        )
    )