diff --git a/contrib/templates/file-push/README.md b/contrib/templates/file-push/README.md index 46b524e..1cb6c40 100644 --- a/contrib/templates/file-push/README.md +++ b/contrib/templates/file-push/README.md @@ -7,4 +7,14 @@ Install it using databricks bundle init --template-dir contrib/templates/file-push https://github.com/databricks/bundle-examples ``` -and follow the generated README.md to get started. \ No newline at end of file +During initialization, you'll be prompted to configure: +- **Catalog and schema** where tables will be created +- **Table definitions** including: + - Table names and file formats (csv, json, avro, parquet) + - Format-specific options (CSV delimiters, JSON parsing options, etc.) + - Schema evolution settings + - Optional schema hints + +The template supports configuring up to 4 tables during initialization. Additional tables can be added later by editing `./src/configs/tables.json`. + +After initialization, follow the generated README.md to deploy and start ingesting data. \ No newline at end of file diff --git a/contrib/templates/file-push/databricks_template_schema.json b/contrib/templates/file-push/databricks_template_schema.json index 6e75b02..4a4005b 100644 --- a/contrib/templates/file-push/databricks_template_schema.json +++ b/contrib/templates/file-push/databricks_template_schema.json @@ -7,7 +7,7 @@ "order": 1, "default": "main", "pattern": "^[a-z_][a-z0-9_-]{0,254}$", - "pattern_match_failure_message": "Name must only consist of letters, numbers, dashes, and underscores." + "pattern_match_failure_message": "Name must only consist of lowercase letters, numbers, dashes, and underscores." }, "schema_name": { "type": "string", @@ -15,8 +15,1246 @@ "order": 2, "default": "filepushschema", "pattern": "^[a-z_][a-z0-9_-]{0,254}$", - "pattern_match_failure_message": "Name must only consist of letters, numbers, dashes, and underscores." 
+ "pattern_match_failure_message": "Schema name must only consist of lowercase letters, numbers, dashes and underscores." + }, + "table_1_name": { + "type": "string", + "description": "\n=== Table 1 Configuration ===\n\nTable name\nTable 1 - Name", + "order": 10, + "default": "table1", + "pattern": "^[a-z_][a-z0-9_-]{0,254}$", + "pattern_match_failure_message": "Table name must only consist of lowercase letters, numbers, underscores, and dashes." + }, + "table_1_format": { + "type": "string", + "description": "\nFile format (csv, json, avro, or parquet)\nTable 1 - Format", + "order": 11, + "default": "csv", + "enum": ["csv", "json", "avro", "parquet"] + }, + "table_1_csv_header": { + "skip_prompt_if": { + "properties": { + "table_1_format": { + "enum": ["json", "avro", "parquet"] + } + } + }, + "type": "string", + "description": "\nDoes the CSV file have a header row?\nTable 1 - CSV Header", + "order": 12, + "default": "true", + "enum": ["true", "false"] + }, + "table_1_csv_delimiter": { + "skip_prompt_if": { + "properties": { + "table_1_format": { + "enum": ["json", "avro", "parquet"] + } + } + }, + "type": "string", + "description": "\nField delimiter character\nTable 1 - CSV Delimiter", + "order": 13, + "default": ",", + "enum": [",", ";", "|", "\t"] + }, + "table_1_csv_escape": { + "skip_prompt_if": { + "properties": { + "table_1_format": { + "enum": ["json", "avro", "parquet"] + } + } + }, + "type": "string", + "description": "\nEscape character\nTable 1 - CSV Escape", + "order": 14, + "default": "\"", + "enum": ["\"", "\\"] + }, + "table_1_csv_multiline": { + "skip_prompt_if": { + "properties": { + "table_1_format": { + "enum": ["json", "avro", "parquet"] + } + } + }, + "type": "string", + "description": "\nAllow multiline values?\nTable 1 - CSV Multiline", + "order": 15, + "default": "false", + "enum": ["true", "false"] + }, + "table_1_json_allow_comments": { + "skip_prompt_if": { + "properties": { + "table_1_format": { + "enum": ["csv", "avro", "parquet"] + 
} + } + }, + "type": "string", + "description": "\nAllow comments in JSON?\nTable 1 - JSON Allow Comments", + "order": 12, + "default": "true", + "enum": ["true", "false"] + }, + "table_1_json_allow_single_quotes": { + "skip_prompt_if": { + "properties": { + "table_1_format": { + "enum": ["csv", "avro", "parquet"] + } + } + }, + "type": "string", + "description": "\nAllow single quotes?\nTable 1 - JSON Allow Single Quotes", + "order": 13, + "default": "true", + "enum": ["true", "false"] + }, + "table_1_json_infer_timestamp": { + "skip_prompt_if": { + "properties": { + "table_1_format": { + "enum": ["csv", "avro", "parquet"] + } + } + }, + "type": "string", + "description": "\nInfer timestamp format?\nTable 1 - JSON Infer Timestamp", + "order": 14, + "default": "true", + "enum": ["true", "false"] + }, + "table_1_json_multiline": { + "skip_prompt_if": { + "properties": { + "table_1_format": { + "enum": ["csv", "avro", "parquet"] + } + } + }, + "type": "string", + "description": "\nAllow multiline JSON?\nTable 1 - JSON Multiline", + "order": 15, + "default": "true", + "enum": ["true", "false"] + }, + "table_1_schema_hints": { + "type": "string", + "description": "\n[Optional] Schema hints (e.g., 'id int, name string'). 
Leave empty for auto-detection.\nTable 1 - Schema Hints", + "order": 16, + "default": "" + }, + "table_1_schema_evolution_mode": { + "type": "string", + "description": "\nSchema evolution mode\nTable 1 - Schema Evolution Mode", + "order": 17, + "default": "addNewColumns", + "enum": ["addNewColumns", "rescue", "none"] + }, + "add_table_2": { + "type": "string", + "description": "\nWould you like to add another table?\nAdd Table 2", + "order": 19, + "default": "yes", + "enum": ["yes", "no"] + }, + "table_2_name": { + "skip_prompt_if": { + "properties": { + "add_table_2": { + "const": "no" + } + } + }, + "type": "string", + "description": "\n=== Table 2 Configuration ===\n\nTable name\nTable 2 - Name", + "order": 20, + "default": "table2", + "pattern": "^[a-z_][a-z0-9_-]{0,254}$", + "pattern_match_failure_message": "Table name must only consist of lowercase letters, numbers, underscores, and dashes." + }, + "table_2_format": { + "skip_prompt_if": { + "properties": { + "add_table_2": { + "const": "no" + } + } + }, + "type": "string", + "description": "\nFile format\nTable 2 - Format", + "order": 21, + "default": "json", + "enum": ["csv", "json", "avro", "parquet"] + }, + "table_2_csv_header": { + "skip_prompt_if": { + "anyOf": [ + { + "properties": { + "add_table_2": { + "const": "no" + } + } + }, + { + "properties": { + "table_2_format": { + "enum": ["json", "avro", "parquet"] + } + } + } + ] + }, + "type": "string", + "description": "\nDoes the CSV file have a header row?\nTable 2 - CSV Header", + "order": 22, + "default": "true", + "enum": ["true", "false"] + }, + "table_2_csv_delimiter": { + "skip_prompt_if": { + "anyOf": [ + { + "properties": { + "add_table_2": { + "const": "no" + } + } + }, + { + "properties": { + "table_2_format": { + "enum": ["json", "avro", "parquet"] + } + } + } + ] + }, + "type": "string", + "description": "\nField delimiter character\nTable 2 - CSV Delimiter", + "order": 23, + "default": ",", + "enum": [",", ";", "|", "\t"] + }, + 
"table_2_csv_escape": { + "skip_prompt_if": { + "anyOf": [ + { + "properties": { + "add_table_2": { + "const": "no" + } + } + }, + { + "properties": { + "table_2_format": { + "enum": ["json", "avro", "parquet"] + } + } + } + ] + }, + "type": "string", + "description": "\nEscape character\nTable 2 - CSV Escape", + "order": 24, + "default": "\"", + "enum": ["\"", "\\"] + }, + "table_2_csv_multiline": { + "skip_prompt_if": { + "anyOf": [ + { + "properties": { + "add_table_2": { + "const": "no" + } + } + }, + { + "properties": { + "table_2_format": { + "enum": ["json", "avro", "parquet"] + } + } + } + ] + }, + "type": "string", + "description": "\nAllow multiline values?\nTable 2 - CSV Multiline", + "order": 25, + "default": "false", + "enum": ["true", "false"] + }, + "table_2_json_allow_comments": { + "skip_prompt_if": { + "anyOf": [ + { + "properties": { + "add_table_2": { + "const": "no" + } + } + }, + { + "properties": { + "table_2_format": { + "enum": ["csv", "avro", "parquet"] + } + } + } + ] + }, + "type": "string", + "description": "\nAllow comments in JSON?\nTable 2 - JSON Allow Comments", + "order": 22, + "default": "true", + "enum": ["true", "false"] + }, + "table_2_json_allow_single_quotes": { + "skip_prompt_if": { + "anyOf": [ + { + "properties": { + "add_table_2": { + "const": "no" + } + } + }, + { + "properties": { + "table_2_format": { + "enum": ["csv", "avro", "parquet"] + } + } + } + ] + }, + "type": "string", + "description": "\nAllow single quotes?\nTable 2 - JSON Allow Single Quotes", + "order": 23, + "default": "true", + "enum": ["true", "false"] + }, + "table_2_json_infer_timestamp": { + "skip_prompt_if": { + "anyOf": [ + { + "properties": { + "add_table_2": { + "const": "no" + } + } + }, + { + "properties": { + "table_2_format": { + "enum": ["csv", "avro", "parquet"] + } + } + } + ] + }, + "type": "string", + "description": "\nInfer timestamp format?\nTable 2 - JSON Infer Timestamp", + "order": 24, + "default": "true", + "enum": ["true", 
"false"] + }, + "table_2_json_multiline": { + "skip_prompt_if": { + "anyOf": [ + { + "properties": { + "add_table_2": { + "const": "no" + } + } + }, + { + "properties": { + "table_2_format": { + "enum": ["csv", "avro", "parquet"] + } + } + } + ] + }, + "type": "string", + "description": "\nAllow multiline JSON?\nTable 2 - JSON Multiline", + "order": 25, + "default": "true", + "enum": ["true", "false"] + }, + "table_2_schema_hints": { + "skip_prompt_if": { + "properties": { + "add_table_2": { + "const": "no" + } + } + }, + "type": "string", + "description": "\n[Optional] Schema hints. Leave empty for auto-detection.\nTable 2 - Schema Hints", + "order": 26, + "default": "" + }, + "table_2_schema_evolution_mode": { + "skip_prompt_if": { + "properties": { + "add_table_2": { + "const": "no" + } + } + }, + "type": "string", + "description": "\nSchema evolution mode\nTable 2 - Schema Evolution Mode", + "order": 27, + "default": "addNewColumns", + "enum": ["addNewColumns", "rescue", "none"] + }, + "add_table_3": { + "skip_prompt_if": { + "properties": { + "add_table_2": { + "const": "no" + } + } + }, + "type": "string", + "description": "\nWould you like to add another table?\nAdd Table 3", + "order": 29, + "default": "no", + "enum": ["yes", "no"] + }, + "table_3_name": { + "skip_prompt_if": { + "anyOf": [ + { + "properties": { + "add_table_2": { + "const": "no" + } + } + }, + { + "properties": { + "add_table_3": { + "const": "no" + } + } + } + ] + }, + "type": "string", + "description": "\n=== Table 3 Configuration ===\n\nTable name\nTable 3 - Name", + "order": 30, + "default": "table3", + "pattern": "^[a-z_][a-z0-9_-]{0,254}$", + "pattern_match_failure_message": "Table name must only consist of lowercase letters, numbers, underscores, and dashes."
+ }, + "table_3_format": { + "skip_prompt_if": { + "anyOf": [ + { + "properties": { + "add_table_2": { + "const": "no" + } + } + }, + { + "properties": { + "add_table_3": { + "const": "no" + } + } + } + ] + }, + "type": "string", + "description": "\nFile format\nTable 3 - Format", + "order": 31, + "default": "avro", + "enum": ["csv", "json", "avro", "parquet"] + }, + "table_3_csv_header": { + "skip_prompt_if": { + "anyOf": [ + { + "properties": { + "add_table_2": { + "const": "no" + } + } + }, + { + "properties": { + "add_table_3": { + "const": "no" + } + } + }, + { + "properties": { + "table_3_format": { + "enum": ["json", "avro", "parquet"] + } + } + } + ] + }, + "type": "string", + "description": "\nDoes the CSV file have a header row?\nTable 3 - CSV Header", + "order": 32, + "default": "true", + "enum": ["true", "false"] + }, + "table_3_csv_delimiter": { + "skip_prompt_if": { + "anyOf": [ + { + "properties": { + "add_table_2": { + "const": "no" + } + } + }, + { + "properties": { + "add_table_3": { + "const": "no" + } + } + }, + { + "properties": { + "table_3_format": { + "enum": ["json", "avro", "parquet"] + } + } + } + ] + }, + "type": "string", + "description": "\nField delimiter character\nTable 3 - CSV Delimiter", + "order": 33, + "default": ",", + "enum": [",", ";", "|", "\t"] + }, + "table_3_csv_escape": { + "skip_prompt_if": { + "anyOf": [ + { + "properties": { + "add_table_2": { + "const": "no" + } + } + }, + { + "properties": { + "add_table_3": { + "const": "no" + } + } + }, + { + "properties": { + "table_3_format": { + "enum": ["json", "avro", "parquet"] + } + } + } + ] + }, + "type": "string", + "description": "\nEscape character\nTable 3 - CSV Escape", + "order": 34, + "default": "\"", + "enum": ["\"", "\\"] + }, + "table_3_csv_multiline": { + "skip_prompt_if": { + "anyOf": [ + { + "properties": { + "add_table_2": { + "const": "no" + } + } + }, + { + "properties": { + "add_table_3": { + "const": "no" + } + } + }, + { + "properties": { + 
"table_3_format": { + "enum": ["json", "avro", "parquet"] + } + } + } + ] + }, + "type": "string", + "description": "\nAllow multiline values?\nTable 3 - CSV Multiline", + "order": 35, + "default": "false", + "enum": ["true", "false"] + }, + "table_3_json_allow_comments": { + "skip_prompt_if": { + "anyOf": [ + { + "properties": { + "add_table_2": { + "const": "no" + } + } + }, + { + "properties": { + "add_table_3": { + "const": "no" + } + } + }, + { + "properties": { + "table_3_format": { + "enum": ["csv", "avro", "parquet"] + } + } + } + ] + }, + "type": "string", + "description": "\nAllow comments in JSON?\nTable 3 - JSON Allow Comments", + "order": 32, + "default": "true", + "enum": ["true", "false"] + }, + "table_3_json_allow_single_quotes": { + "skip_prompt_if": { + "anyOf": [ + { + "properties": { + "add_table_2": { + "const": "no" + } + } + }, + { + "properties": { + "add_table_3": { + "const": "no" + } + } + }, + { + "properties": { + "table_3_format": { + "enum": ["csv", "avro", "parquet"] + } + } + } + ] + }, + "type": "string", + "description": "\nAllow single quotes?\nTable 3 - JSON Allow Single Quotes", + "order": 33, + "default": "true", + "enum": ["true", "false"] + }, + "table_3_json_infer_timestamp": { + "skip_prompt_if": { + "anyOf": [ + { + "properties": { + "add_table_2": { + "const": "no" + } + } + }, + { + "properties": { + "add_table_3": { + "const": "no" + } + } + }, + { + "properties": { + "table_3_format": { + "enum": ["csv", "avro", "parquet"] + } + } + } + ] + }, + "type": "string", + "description": "\nInfer timestamp format?\nTable 3 - JSON Infer Timestamp", + "order": 34, + "default": "true", + "enum": ["true", "false"] + }, + "table_3_json_multiline": { + "skip_prompt_if": { + "anyOf": [ + { + "properties": { + "add_table_2": { + "const": "no" + } + } + }, + { + "properties": { + "add_table_3": { + "const": "no" + } + } + }, + { + "properties": { + "table_3_format": { + "enum": ["csv", "avro", "parquet"] + } + } + } + ] + }, + "type": 
"string", + "description": "\nAllow multiline JSON?\nTable 3 - JSON Multiline", + "order": 35, + "default": "true", + "enum": ["true", "false"] + }, + "table_3_schema_hints": { + "skip_prompt_if": { + "anyOf": [ + { + "properties": { + "add_table_2": { + "const": "no" + } + } + }, + { + "properties": { + "add_table_3": { + "const": "no" + } + } + } + ] + }, + "type": "string", + "description": "\n[Optional] Schema hints. Leave empty for auto-detection.\nTable 3 - Schema Hints", + "order": 36, + "default": "" + }, + "table_3_schema_evolution_mode": { + "skip_prompt_if": { + "anyOf": [ + { + "properties": { + "add_table_2": { + "const": "no" + } + } + }, + { + "properties": { + "add_table_3": { + "const": "no" + } + } + } + ] + }, + "type": "string", + "description": "\nSchema evolution mode\nTable 3 - Schema Evolution Mode", + "order": 37, + "default": "addNewColumns", + "enum": ["addNewColumns", "rescue", "none"] + }, + "add_table_4": { + "skip_prompt_if": { + "anyOf": [ + { + "properties": { + "add_table_2": { + "const": "no" + } + } + }, + { + "properties": { + "add_table_3": { + "const": "no" + } + } + } + ] + }, + "type": "string", + "description": "\nWould you like to add another table?\nAdd Table 4", + "order": 39, + "default": "no", + "enum": ["yes", "no"] + }, + "table_4_name": { + "skip_prompt_if": { + "anyOf": [ + { + "properties": { + "add_table_2": { + "const": "no" + } + } + }, + { + "properties": { + "add_table_3": { + "const": "no" + } + } + }, + { + "properties": { + "add_table_4": { + "const": "no" + } + } + } + ] + }, + "type": "string", + "description": "\n=== Table 4 Configuration ===\n\nTable name\nTable 4 - Name", + "order": 40, + "default": "table4", + "pattern": "^[a-z_][a-z0-9_-]{0,254}$", + "pattern_match_failure_message": "Table name must only consist of lowercase letters, numbers, underscores, and dashes." 
+ }, + "table_4_format": { + "skip_prompt_if": { + "anyOf": [ + { + "properties": { + "add_table_2": { + "const": "no" + } + } + }, + { + "properties": { + "add_table_3": { + "const": "no" + } + } + }, + { + "properties": { + "add_table_4": { + "const": "no" + } + } + } + ] + }, + "type": "string", + "description": "\nFile format\nTable 4 - Format", + "order": 41, + "default": "parquet", + "enum": ["csv", "json", "avro", "parquet"] + }, + "table_4_csv_header": { + "skip_prompt_if": { + "anyOf": [ + { + "properties": { + "add_table_2": { + "const": "no" + } + } + }, + { + "properties": { + "add_table_3": { + "const": "no" + } + } + }, + { + "properties": { + "add_table_4": { + "const": "no" + } + } + }, + { + "properties": { + "table_4_format": { + "enum": ["json", "avro", "parquet"] + } + } + } + ] + }, + "type": "string", + "description": "\nDoes the CSV file have a header row?\nTable 4 - CSV Header", + "order": 42, + "default": "true", + "enum": ["true", "false"] + }, + "table_4_csv_delimiter": { + "skip_prompt_if": { + "anyOf": [ + { + "properties": { + "add_table_2": { + "const": "no" + } + } + }, + { + "properties": { + "add_table_3": { + "const": "no" + } + } + }, + { + "properties": { + "add_table_4": { + "const": "no" + } + } + }, + { + "properties": { + "table_4_format": { + "enum": ["json", "avro", "parquet"] + } + } + } + ] + }, + "type": "string", + "description": "\nField delimiter character\nTable 4 - CSV Delimiter", + "order": 43, + "default": ",", + "enum": [",", ";", "|", "\t"] + }, + "table_4_csv_escape": { + "skip_prompt_if": { + "anyOf": [ + { + "properties": { + "add_table_2": { + "const": "no" + } + } + }, + { + "properties": { + "add_table_3": { + "const": "no" + } + } + }, + { + "properties": { + "add_table_4": { + "const": "no" + } + } + }, + { + "properties": { + "table_4_format": { + "enum": ["json", "avro", "parquet"] + } + } + } + ] + }, + "type": "string", + "description": "\nEscape character\nTable 4 - CSV Escape", + "order": 44, + 
"default": "\"", + "enum": ["\"", "\\"] + }, + "table_4_csv_multiline": { + "skip_prompt_if": { + "anyOf": [ + { + "properties": { + "add_table_2": { + "const": "no" + } + } + }, + { + "properties": { + "add_table_3": { + "const": "no" + } + } + }, + { + "properties": { + "add_table_4": { + "const": "no" + } + } + }, + { + "properties": { + "table_4_format": { + "enum": ["json", "avro", "parquet"] + } + } + } + ] + }, + "type": "string", + "description": "\nAllow multiline values?\nTable 4 - CSV Multiline", + "order": 45, + "default": "false", + "enum": ["true", "false"] + }, + "table_4_json_allow_comments": { + "skip_prompt_if": { + "anyOf": [ + { + "properties": { + "add_table_2": { + "const": "no" + } + } + }, + { + "properties": { + "add_table_3": { + "const": "no" + } + } + }, + { + "properties": { + "add_table_4": { + "const": "no" + } + } + }, + { + "properties": { + "table_4_format": { + "enum": ["csv", "avro", "parquet"] + } + } + } + ] + }, + "type": "string", + "description": "\nAllow comments in JSON?\nTable 4 - JSON Allow Comments", + "order": 42, + "default": "true", + "enum": ["true", "false"] + }, + "table_4_json_allow_single_quotes": { + "skip_prompt_if": { + "anyOf": [ + { + "properties": { + "add_table_2": { + "const": "no" + } + } + }, + { + "properties": { + "add_table_3": { + "const": "no" + } + } + }, + { + "properties": { + "add_table_4": { + "const": "no" + } + } + }, + { + "properties": { + "table_4_format": { + "enum": ["csv", "avro", "parquet"] + } + } + } + ] + }, + "type": "string", + "description": "\nAllow single quotes?\nTable 4 - JSON Allow Single Quotes", + "order": 43, + "default": "true", + "enum": ["true", "false"] + }, + "table_4_json_infer_timestamp": { + "skip_prompt_if": { + "anyOf": [ + { + "properties": { + "add_table_2": { + "const": "no" + } + } + }, + { + "properties": { + "add_table_3": { + "const": "no" + } + } + }, + { + "properties": { + "add_table_4": { + "const": "no" + } + } + }, + { + "properties": { + 
"table_4_format": { + "enum": ["csv", "avro", "parquet"] + } + } + } + ] + }, + "type": "string", + "description": "\nInfer timestamp format?\nTable 4 - JSON Infer Timestamp", + "order": 44, + "default": "true", + "enum": ["true", "false"] + }, + "table_4_json_multiline": { + "skip_prompt_if": { + "anyOf": [ + { + "properties": { + "add_table_2": { + "const": "no" + } + } + }, + { + "properties": { + "add_table_3": { + "const": "no" + } + } + }, + { + "properties": { + "add_table_4": { + "const": "no" + } + } + }, + { + "properties": { + "table_4_format": { + "enum": ["csv", "avro", "parquet"] + } + } + } + ] + }, + "type": "string", + "description": "\nAllow multiline JSON?\nTable 4 - JSON Multiline", + "order": 45, + "default": "true", + "enum": ["true", "false"] + }, + "table_4_schema_hints": { + "skip_prompt_if": { + "anyOf": [ + { + "properties": { + "add_table_2": { + "const": "no" + } + } + }, + { + "properties": { + "add_table_3": { + "const": "no" + } + } + }, + { + "properties": { + "add_table_4": { + "const": "no" + } + } + } + ] + }, + "type": "string", + "description": "\n[Optional] Schema hints. Leave empty for auto-detection.\nTable 4 - Schema Hints", + "order": 46, + "default": "" + }, + "table_4_schema_evolution_mode": { + "skip_prompt_if": { + "anyOf": [ + { + "properties": { + "add_table_2": { + "const": "no" + } + } + }, + { + "properties": { + "add_table_3": { + "const": "no" + } + } + }, + { + "properties": { + "add_table_4": { + "const": "no" + } + } + } + ] + }, + "type": "string", + "description": "\nSchema evolution mode\nTable 4 - Schema Evolution Mode", + "order": 47, + "default": "addNewColumns", + "enum": ["addNewColumns", "rescue", "none"] } }, - "success_message": "\nBundle folder '{{.catalog_name}}.{{.schema_name}}' has been created. Please refer to the README.md for next steps." 
+ "success_message": "\nBundle folder '{{.catalog_name}}.{{.schema_name}}' has been created with your configured tables.\n\nPlease refer to the README.md for next steps." } diff --git a/contrib/templates/file-push/template/{{.catalog_name}}.{{.schema_name}}/README.md.tmpl b/contrib/templates/file-push/template/{{.catalog_name}}.{{.schema_name}}/README.md.tmpl index 58b6fc2..de5e30e 100644 --- a/contrib/templates/file-push/template/{{.catalog_name}}.{{.schema_name}}/README.md.tmpl +++ b/contrib/templates/file-push/template/{{.catalog_name}}.{{.schema_name}}/README.md.tmpl @@ -18,31 +18,50 @@ A lightweight, no‑code file ingestion workflow. Configure a set of tables, get ## Quick Start -### Step 1. Configure tables -Edit table configs in `./src/configs/tables.json`. Only `name` and `format` are required. +### Step 1. Review table configuration +Your tables have been configured during initialization. You can review and modify them in `./src/configs/tables.json`. Currently supported formats are `csv`, `json`, `avro` and `parquet`. For supported `format_options`, see the [Auto Loader options](https://docs.databricks.com/aws/en/ingestion/cloud-object-storage/auto-loader/options). Not all options are supported here. If unsure, specify only `name` and `format`, or follow [Debug Table Issues](#debug-table-issues) to discover the correct options. 
+**Example table configuration:** ```json [ { "name": "table1", "format": "csv", - "format_options": - { - "escape": "\"" + "format_options": { + "header": "true", + "delimiter": ",", + "escape": "\"", + "multiLine": "false", + "cloudFiles.schemaEvolutionMode": "addNewColumns", + "cloudFiles.inferColumnTypes": "true" }, "schema_hints": "id int, name string" }, { "name": "table2", - "format": "json" + "format": "json", + "format_options": { + "allowComments": "true", + "allowSingleQuotes": "true", + "inferTimestamp": "true", + "multiLine": "true", + "cloudFiles.schemaEvolutionMode": "addNewColumns", + "cloudFiles.inferColumnTypes": "true" + } } ] ``` +**Supported format options by file type:** +- **CSV**: `header`, `delimiter`, `escape`, `multiLine` +- **JSON**: `allowComments`, `allowSingleQuotes`, `inferTimestamp`, `multiLine` +- **AVRO/PARQUET**: Use base options only +- **All formats**: `cloudFiles.schemaEvolutionMode`, `cloudFiles.inferColumnTypes` + > **Tip:** Keep `schema_hints` minimal; Auto Loader can evolve the schema as new columns appear. ### Step 2. Deploy & set up @@ -98,10 +117,10 @@ Within about a minute, the data should appear in the table `{{.catalog_name}}.{{ --- ## Debug Table Issues -If data isn’t parsed as expected, use **dev mode** to iterate on table options safely. +If data isn't parsed as expected, use **dev mode** to iterate on table options safely. ### Step 1. Configure tables to debug -Configure tables as in [Step 1 of Quick Start](#step-1-configure-tables). +Review or modify tables in `./src/configs/tables.json` as needed. ### Step 2. 
Deploy & set up in **dev mode** diff --git a/contrib/templates/file-push/template/{{.catalog_name}}.{{.schema_name}}/src/configs/tables.json b/contrib/templates/file-push/template/{{.catalog_name}}.{{.schema_name}}/src/configs/tables.json deleted file mode 100644 index 0a57f16..0000000 --- a/contrib/templates/file-push/template/{{.catalog_name}}.{{.schema_name}}/src/configs/tables.json +++ /dev/null @@ -1,18 +0,0 @@ -[ - { - "name": "example_table_csv", - "format": "csv" - }, - { - "name": "example_table_json", - "format": "json" - }, - { - "name": "example_table_avro", - "format": "avro" - }, - { - "name": "example_table_parquet", - "format": "parquet" - } -] diff --git a/contrib/templates/file-push/template/{{.catalog_name}}.{{.schema_name}}/src/configs/tables.json.tmpl b/contrib/templates/file-push/template/{{.catalog_name}}.{{.schema_name}}/src/configs/tables.json.tmpl new file mode 100644 index 0000000..83a0d72 --- /dev/null +++ b/contrib/templates/file-push/template/{{.catalog_name}}.{{.schema_name}}/src/configs/tables.json.tmpl @@ -0,0 +1,82 @@ +[ + { + "name": "{{.table_1_name}}", + "format": "{{.table_1_format}}", + "format_options": { +{{- if eq .table_1_format "csv"}} + "header": "{{.table_1_csv_header}}", + "delimiter": "{{if eq .table_1_csv_delimiter "\t"}}\\t{{else}}{{.table_1_csv_delimiter}}{{end}}", + "escape": "{{if eq .table_1_csv_escape "\""}}\"{{else if eq .table_1_csv_escape "\\"}}\\{{else}}{{.table_1_csv_escape}}{{end}}", + "multiLine": "{{.table_1_csv_multiline}}", +{{- else if eq .table_1_format "json"}} + "allowComments": "{{.table_1_json_allow_comments}}", + "allowSingleQuotes": "{{.table_1_json_allow_single_quotes}}", + "inferTimestamp": "{{.table_1_json_infer_timestamp}}", + "multiLine": "{{.table_1_json_multiline}}", +{{- end}} + "cloudFiles.schemaEvolutionMode": "{{.table_1_schema_evolution_mode}}" + }{{if .table_1_schema_hints}}, + "schema_hints": "{{.table_1_schema_hints}}"{{end}} + }{{if eq .add_table_2 "yes"}}, + { + "name": 
"{{.table_2_name}}", + "format": "{{.table_2_format}}", + "format_options": { +{{- if eq .table_2_format "csv"}} + "header": "{{.table_2_csv_header}}", + "delimiter": "{{if eq .table_2_csv_delimiter "\t"}}\\t{{else}}{{.table_2_csv_delimiter}}{{end}}", + "escape": "{{if eq .table_2_csv_escape "\""}}\"{{else if eq .table_2_csv_escape "\\"}}\\{{else}}{{.table_2_csv_escape}}{{end}}", + "multiLine": "{{.table_2_csv_multiline}}", +{{- else if eq .table_2_format "json"}} + "allowComments": "{{.table_2_json_allow_comments}}", + "allowSingleQuotes": "{{.table_2_json_allow_single_quotes}}", + "inferTimestamp": "{{.table_2_json_infer_timestamp}}", + "multiLine": "{{.table_2_json_multiline}}", +{{- end}} + "cloudFiles.schemaEvolutionMode": "{{.table_2_schema_evolution_mode}}" + }{{if .table_2_schema_hints}}, + "schema_hints": "{{.table_2_schema_hints}}"{{end}} + }{{end}}{{if and (eq .add_table_2 "yes") (eq .add_table_3 "yes")}}, + { + "name": "{{.table_3_name}}", + "format": "{{.table_3_format}}", + "format_options": { +{{- if eq .table_3_format "csv"}} + "header": "{{.table_3_csv_header}}", + "delimiter": "{{if eq .table_3_csv_delimiter "\t"}}\\t{{else}}{{.table_3_csv_delimiter}}{{end}}", + "escape": "{{if eq .table_3_csv_escape "\""}}\"{{else if eq .table_3_csv_escape "\\"}}\\{{else}}{{.table_3_csv_escape}}{{end}}", + "multiLine": "{{.table_3_csv_multiline}}", +{{- else if eq .table_3_format "json"}} + "allowComments": "{{.table_3_json_allow_comments}}", + "allowSingleQuotes": "{{.table_3_json_allow_single_quotes}}", + "inferTimestamp": "{{.table_3_json_infer_timestamp}}", + "multiLine": "{{.table_3_json_multiline}}", +{{- end}} + "cloudFiles.schemaEvolutionMode": "{{.table_3_schema_evolution_mode}}" + }{{if .table_3_schema_hints}}, + "schema_hints": "{{.table_3_schema_hints}}"{{end}} + }{{end}}{{if and (eq .add_table_2 "yes") (eq .add_table_3 "yes") (eq .add_table_4 "yes")}}, + { + "name": "{{.table_4_name}}", + "format": "{{.table_4_format}}", + "format_options": { +{{- if 
eq .table_4_format "csv"}} + "header": "{{.table_4_csv_header}}", + "delimiter": "{{if eq .table_4_csv_delimiter "\t"}}\\t{{else}}{{.table_4_csv_delimiter}}{{end}}", + "escape": "{{if eq .table_4_csv_escape "\""}}\"{{else if eq .table_4_csv_escape "\\"}}\\{{else}}{{.table_4_csv_escape}}{{end}}", + "multiLine": "{{.table_4_csv_multiline}}", +{{- else if eq .table_4_format "json"}} + "allowComments": "{{.table_4_json_allow_comments}}", + "allowSingleQuotes": "{{.table_4_json_allow_single_quotes}}", + "inferTimestamp": "{{.table_4_json_infer_timestamp}}", + "multiLine": "{{.table_4_json_multiline}}", +{{- end}} + "cloudFiles.schemaEvolutionMode": "{{.table_4_schema_evolution_mode}}" + }{{if .table_4_schema_hints}}, + "schema_hints": "{{.table_4_schema_hints}}"{{end}} + }{{end}} +] + + + + diff --git a/contrib/templates/file-push/template/{{.catalog_name}}.{{.schema_name}}/src/utils/envmanager.py b/contrib/templates/file-push/template/{{.catalog_name}}.{{.schema_name}}/src/utils/envmanager.py index 432c964..ee1e3c3 100644 --- a/contrib/templates/file-push/template/{{.catalog_name}}.{{.schema_name}}/src/utils/envmanager.py +++ b/contrib/templates/file-push/template/{{.catalog_name}}.{{.schema_name}}/src/utils/envmanager.py @@ -16,20 +16,20 @@ def get_config() -> dict: return configs -def has_default_storage() -> bool: - catalog = get_config()["catalog_name"] - - w = WorkspaceClient() +def has_default_storage( + catalog_name: str, workspace_client: WorkspaceClient = None +) -> bool: + w = workspace_client or WorkspaceClient() # Try SDK model first - info = w.catalogs.get(catalog) + info = w.catalogs.get(catalog_name) storage_root = getattr(info, "storage_root", None) storage_location = getattr(info, "storage_location", None) props = getattr(info, "properties", {}) or {} # Some workspaces expose fields only via raw JSON; fall back if all empty if not (storage_root or storage_location or props): - j = w.api_client.do("GET", f"/api/2.1/unity-catalog/catalogs/{catalog}") + j 
= w.api_client.do("GET", f"/api/2.1/unity-catalog/catalogs/{catalog_name}") storage_root = j.get("storage_root") or j.get("storageLocation") storage_location = j.get("storage_location") or j.get("storageLocation") props = j.get("properties", {}) or {} diff --git a/contrib/templates/file-push/template/{{.catalog_name}}.{{.schema_name}}/src/utils/formatmanager.py b/contrib/templates/file-push/template/{{.catalog_name}}.{{.schema_name}}/src/utils/formatmanager.py index 368109e..b85379c 100644 --- a/contrib/templates/file-push/template/{{.catalog_name}}.{{.schema_name}}/src/utils/formatmanager.py +++ b/contrib/templates/file-push/template/{{.catalog_name}}.{{.schema_name}}/src/utils/formatmanager.py @@ -128,17 +128,24 @@ def __init__(self): } -_supported_formats: dict[str, AutoLoaderFormat] = { - f.name: f for f in (CSV(), JSON(), AVRO(), PARQUET()) -} +# Cache the supported formats so they are only created on the first call +_supported_formats_cache = {} + + +def get_supported_formats() -> dict[str, AutoLoaderFormat]: + if not _supported_formats_cache: + _supported_formats_cache.update( + {f.name: f for f in (CSV(), JSON(), AVRO(), PARQUET())} + ) + return _supported_formats_cache def get_format_manager(fmt: str) -> dict[str, str]: key = fmt.strip().upper() try: - return _supported_formats[key] + return get_supported_formats()[key] except KeyError: - supported = ", ".join(sorted(_supported_formats)) + supported = ", ".join(sorted(get_supported_formats().keys())) raise ValueError( f"{fmt!r} is not a supported format. 
# Initialize workspace client
ws = WorkspaceClient()

# Dump configs to environment json early so the utils modules that read
# ../configs/environment.json can be used by the checks below.
all_configs = {
    "catalog_name": catalog_name,
    "schema_name": schema_name,
    "volume_path_root": volume_path_root,
    "volume_path_data": volume_path_data,
    "volume_path_archive": volume_path_archive,
}
with open("../configs/environment.json", "w") as f:
    json.dump(all_configs, f)
logger.info("Environment configuration file created")

# Load and validate table configurations early for fail-fast:
# nothing is created in the workspace if the table definitions are invalid.
logger.info("Loading and validating table configurations")
with open("../configs/tables.json", "r") as f:
    table_configs = json.load(f)

logger.debug(f"Found {len(table_configs)} table(s) in configuration")
try:
    tablemanager.validate_configs(table_configs)
    logger.info("All table configurations validated successfully")
except ValueError as e:
    # Log for visibility, then re-raise so initialization stops here.
    logger.error(f"Configuration validation failed: {e}")
    raise

# Check for default storage (recommended setting).
# This is best-effort: a failure to verify only produces a warning.
logger.info(f"Checking default storage configuration for catalog {catalog_name}")
try:
    # has_default_storage takes (catalog_name, workspace_client). Passing the
    # client first made the catalog name string act as the client, so the
    # lookup always failed and was swallowed by the except below.
    if not envmanager.has_default_storage(catalog_name, workspace_client=ws):
        logger.warning(
            f"Default storage is NOT enabled for catalog '{catalog_name}'. "
            "It is recommended to enable default storage for the catalog to ensure "
            "proper data management and storage configuration."
        )
    else:
        logger.info(f"Default storage is enabled for catalog '{catalog_name}'")
except Exception as e:
    logger.warning(f"Could not verify default storage setting: {e}")
def validate_config(table_config: dict):
    """Validate a single table configuration entry.

    Checks, in order: that ``name`` and ``format`` are present, that the name
    uses only allowed characters, that the format is known to
    ``formatmanager``, and that any ``format_options`` are accepted by that
    format's manager.

    Args:
        table_config: One table entry, as loaded from the tables config.

    Raises:
        ValueError: If any of the checks above fails. Errors coming from the
            format manager are re-raised with the table name added and the
            original exception chained as the cause.
    """
    # Required fields
    if not table_config.get("name"):
        raise ValueError("name is required for table config")
    if not table_config.get("format"):
        raise ValueError("format is required for table config")

    # Validate table name characters (Databricks naming convention).
    # fullmatch is used because "$" with re.match also matches just before a
    # trailing newline, which would let a name like "table1\n" through.
    # Non-string names are rejected here so callers see ValueError rather
    # than a TypeError from the regex engine.
    table_name = table_config.get("name")
    if not isinstance(table_name, str) or not re.fullmatch(
        r"[a-z0-9_-]+", table_name
    ):
        raise ValueError(
            f"Table name '{table_name}' contains unsupported characters. "
            "Table names must only consist of lowercase letters, numbers, underscores, and dashes."
        )

    # Validate format is supported
    fmt = table_config.get("format")
    if not isinstance(fmt, str):
        raise ValueError(
            f"Unsupported format for table '{table_name}': "
            f"format must be a string, got {type(fmt).__name__}"
        )
    try:
        fmt_mgr = formatmanager.get_format_manager(fmt)
    except ValueError as e:
        raise ValueError(f"Unsupported format for table '{table_name}': {e}") from e

    # Validate format options (check for blocklisted/hidden options).
    # An absent or empty format_options entry is accepted as-is.
    format_options = table_config.get("format_options", {})
    if format_options:
        try:
            fmt_mgr.validate_user_options(format_options)
        except ValueError as e:
            raise ValueError(
                f"Invalid format options for table '{table_name}': {e}"
            ) from e