Skip to content

Commit e1e0272

Browse files
authored
Merge pull request #2 from rahulj51/delta-lake
Adding support for delta lake table format
2 parents 6d04262 + c6f04d5 commit e1e0272

File tree

6 files changed

+67
-24
lines changed

6 files changed

+67
-24
lines changed

README.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,15 @@ _youcanevenwritestraighttos3_ 🤭
8484
fake -n 10 pyint,user_name,date_this_year -f parquet -o s3://YOUR_BUCKET/data/sample.parquet
8585
```
8686

87+
### Delta Lake
88+
89+
Data can be exported as a Delta Lake table.
90+
91+
```bash
92+
fake -n 10 pyint,user_name,date_this_year -f deltalake -o sample_data
93+
```
94+
95+
8796
## Templates
8897

8998
Want to generate 1 MILLION S3 Access logs in ~2 minutes? Now you can.

faker_cli/cli.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import sys
44
from faker_cli.templates import CloudFrontWriter, S3AccessLogs, S3AccessWriter, CloudTrailLogs, CloudFrontLogs
55

6-
from faker_cli.writer import CSVWriter, JSONWriter, ParquetWriter
6+
from faker_cli.writer import CSVWriter, JSONWriter, ParquetWriter, DeltaLakeWriter
77
from typing import List
88

99
def infer_column_names(col_names, col_types: str) -> List[str]:
@@ -20,6 +20,7 @@ def infer_column_names(col_names, col_types: str) -> List[str]:
2020
"csv": CSVWriter,
2121
"json": JSONWriter,
2222
"parquet": ParquetWriter,
23+
"deltalake": DeltaLakeWriter
2324
}
2425

2526
TEMPLATE_MAPPER = {
@@ -33,7 +34,7 @@ def infer_column_names(col_names, col_types: str) -> List[str]:
3334

3435
@click.command()
3536
@click.option("--num-rows", "-n", default=1, help="Number of rows")
36-
@click.option("--format", "-f", type=click.Choice(["csv", "json", "parquet"]), default="csv", help="Format of the output")
37+
@click.option("--format", "-f", type=click.Choice(["csv", "json", "parquet", "deltalake"]), default="csv", help="Format of the output")
3738
@click.option("--output", "-o", type=click.Path(writable=True))
3839
@click.option("--columns", "-c", help="Column names", default=None, required=False)
3940
@click.option("--template", "-t", help="Template to use", type=click.Choice(["s3access", "cloudfront"]), default=None)
@@ -57,9 +58,9 @@ def main(num_rows, format, output, columns, template, column_types):
5758
)
5859

5960
# Parquet and Delta Lake output require a filename
60-
if format == "parquet" and output is None:
61-
raise click.BadArgumentUsage("parquet format requires --output/-o filename parameter.")
62-
if output is not None and format != "parquet":
61+
if format in ["parquet", "deltalake"] and output is None:
62+
raise click.BadArgumentUsage("parquet | deltalake formats requires --output/-o filename parameter.")
63+
if output is not None and format not in ["parquet", "deltalake"]:
6364
raise click.BadArgumentUsage("output files not supported for csv/json yet.")
6465

6566
# If the user provides a template, we use that provider and writer and exit.

faker_cli/writer.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from typing import Optional
44
import pyarrow as pa
55
import pyarrow.parquet as pq
6+
import deltalake
67

78

89
class Writer:
@@ -56,3 +57,8 @@ def write(self, row):
5657

5758
def close(self):
5859
pq.write_table(self.table, self.filename)
60+
61+
62+
class DeltaLakeWriter(ParquetWriter):
    """Writer that persists its table in the Delta Lake table format.

    Everything except ``close`` is inherited from ``ParquetWriter``.
    """

    def close(self):
        """Write ``self.table`` as a Delta Lake table at ``self.filename``."""
        destination = self.filename
        deltalake.write_deltalake(table_or_uri=destination, data=self.table)

poetry.lock

Lines changed: 29 additions & 18 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ python = "^3.9"
1111
faker = "^18.9.0"
1212
click = "^8.1.3"
1313
pyarrow = "^12.0.0"
14+
deltalake = "^0.9.0"
1415

1516

1617
[tool.poetry.group.dev.dependencies]

tests/test_cli.py

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from faker_cli.cli import main
22
from click.testing import CliRunner
33
import json
4+
import deltalake
45

56

67
# Test that help is provided if the user provides no arguments
@@ -50,4 +51,18 @@ def test_custom_column_names():
5051
lines = result.output.strip().splitlines()
5152
data: dict = json.loads(lines[0])
5253
assert len(data.keys()) == 2
53-
assert list(data) == ["first", "second"]
54+
assert list(data) == ["first", "second"]
55+
56+
def test_deltalake_output(tmp_path):
    """The deltalake format produces a readable table with one row and two columns."""
    table_dir = tmp_path / 'table'
    outcome = CliRunner().invoke(
        main, ["pyint,user_name", "-f", "deltalake", "-o", table_dir]
    )
    assert outcome.exit_code == 0

    # Read the table back through deltalake to inspect what the CLI wrote.
    written = deltalake.DeltaTable(table_dir).to_pyarrow_table()
    assert written.num_rows == 1

    assert written.column_names == ["pyint", "user_name"]
    assert written.num_columns == 2

0 commit comments

Comments
 (0)