diff --git a/dataflow/DistinctRowsAll.json b/dataflow/DistinctRowsAll.json new file mode 100644 index 0000000..a57b750 --- /dev/null +++ b/dataflow/DistinctRowsAll.json @@ -0,0 +1,54 @@ +{ + "name": "DistinctRowsAll", + "properties": { + "description": "Distinct Rows using fuzzy, key, all cols", + "type": "MappingDataFlow", + "typeProperties": { + "sources": [ + { + "dataset": { + "referenceName": "MoviesD2", + "type": "DatasetReference" + }, + "name": "MoviesCSV" + } + ], + "sinks": [ + { + "dataset": { + "referenceName": "folderout", + "type": "DatasetReference" + }, + "name": "OutputDistinctData" + } + ], + "transformations": [ + { + "name": "DistinctRows" + }, + { + "name": "RowCountDistinct" + }, + { + "name": "OriginalData" + }, + { + "name": "RowCountOrig" + }, + { + "name": "DistinctAllCols" + }, + { + "name": "FuzzyMatch" + }, + { + "name": "RowCountAll" + }, + { + "name": "RowCountFuzzy" + } + ], + "script": "source(output(\n\t\tmovie as string,\n\t\ttitle as string,\n\t\tgenres as string,\n\t\tyear as string,\n\t\tRating as string,\n\t\tRottenTomato as string\n\t),\n\tallowSchemaDrift: true,\n\tvalidateSchema: false,\n\tignoreNoFilesFound: false) ~> MoviesCSV\nMoviesCSV aggregate(groupBy(movie),\n\teach(match(name!='movie'), $$ = first($$))) ~> DistinctRows\nDistinctRows aggregate(rowcount_agg = count(1)) ~> RowCountDistinct\nMoviesCSV select(mapColumn(\n\t\tmovie,\n\t\ttitle,\n\t\tgenres,\n\t\tyear,\n\t\tRating,\n\t\tRottenTomato\n\t),\n\tskipDuplicateMapInputs: false,\n\tskipDuplicateMapOutputs: false) ~> OriginalData\nOriginalData aggregate(rowcount_orig = count(1)) ~> RowCountOrig\nMoviesCSV aggregate(groupBy(mycols = sha2(256,columns())),\n\teach(match(true()), $$ = first($$))) ~> DistinctAllCols\nMoviesCSV aggregate(groupBy(colsfuzzy = sha2(256,movie,year,soundex(title))),\n\teach(match(true()), $$ = first($$))) ~> FuzzyMatch\nDistinctAllCols aggregate(rowcountall = count(1)) ~> RowCountAll\nFuzzyMatch aggregate(rowcountfuzzy = count(1)) ~> RowCountFuzzy\nDistinctRows sink(allowSchemaDrift: true,\n\tvalidateSchema: false,\n\tpartitionBy('hash', 1)) ~> OutputDistinctData" + } + } +} \ No newline at end of file diff --git a/dataflow/DistinctRowsAll1.json b/dataflow/DistinctRowsAll1.json new file mode 100644 index 0000000..2dd68cf --- /dev/null +++ b/dataflow/DistinctRowsAll1.json @@ -0,0 +1,54 @@ +{ + "name": "DistinctRowsAll1", + "properties": { + "description": "Distinct Rows using fuzzy, key, all cols", + "type": "MappingDataFlow", + "typeProperties": { + "sources": [ + { + "dataset": { + "referenceName": "MoviesD2", + "type": "DatasetReference" + }, + "name": "MoviesCSV" + } + ], + "sinks": [ + { + "dataset": { + "referenceName": "folderout", + "type": "DatasetReference" + }, + "name": "OutputDistinctData" + } + ], + "transformations": [ + { + "name": "DistinctRows" + }, + { + "name": "RowCountDistinct" + }, + { + "name": "OriginalData" + }, + { + "name": "RowCountOrig" + }, + { + "name": "DistinctAllCols" + }, + { + "name": "FuzzyMatch" + }, + { + "name": "RowCountAll" + }, + { + "name": "RowCountFuzzy" + } + ], + "script": "source(output(\n\t\tmovie as string,\n\t\ttitle as string,\n\t\tgenres as string,\n\t\tyear as string,\n\t\tRating as string,\n\t\tRottenTomato as string\n\t),\n\tallowSchemaDrift: true,\n\tvalidateSchema: false,\n\tignoreNoFilesFound: false) ~> MoviesCSV\nMoviesCSV aggregate(groupBy(movie),\n\teach(match(name!='movie'), $$ = first($$))) ~> DistinctRows\nDistinctRows aggregate(rowcount_agg = count(1)) ~> RowCountDistinct\nMoviesCSV select(mapColumn(\n\t\tmovie,\n\t\ttitle,\n\t\tgenres,\n\t\tyear,\n\t\tRating,\n\t\tRottenTomato\n\t),\n\tskipDuplicateMapInputs: false,\n\tskipDuplicateMapOutputs: false) ~> OriginalData\nOriginalData aggregate(rowcount_orig = count(1)) ~> RowCountOrig\nMoviesCSV aggregate(groupBy(mycols = sha2(256,columns())),\n\teach(match(true()), $$ = first($$))) ~> DistinctAllCols\nMoviesCSV aggregate(groupBy(colsfuzzy = sha2(256,movie,year,soundex(title))),\n\teach(match(true()), $$ = first($$))) ~> FuzzyMatch\nDistinctAllCols aggregate(rowcountall = count(1)) ~> RowCountAll\nFuzzyMatch aggregate(rowcountfuzzy = count(1)) ~> RowCountFuzzy\nDistinctRows sink(allowSchemaDrift: true,\n\tvalidateSchema: false,\n\tpartitionBy('hash', 1)) ~> OutputDistinctData" + } + } +} \ No newline at end of file diff --git a/dataflow/DistinctRowsAll_copy1.json b/dataflow/DistinctRowsAll_copy1.json new file mode 100644 index 0000000..cfeea76 --- /dev/null +++ b/dataflow/DistinctRowsAll_copy1.json @@ -0,0 +1,54 @@ +{ + "name": "DistinctRowsAll_copy1", + "properties": { + "description": "Distinct Rows using fuzzy, key, all cols", + "type": "MappingDataFlow", + "typeProperties": { + "sources": [ + { + "dataset": { + "referenceName": "MoviesD2", + "type": "DatasetReference" + }, + "name": "MoviesCSV" + } + ], + "sinks": [ + { + "dataset": { + "referenceName": "folderout", + "type": "DatasetReference" + }, + "name": "OutputDistinctData" + } + ], + "transformations": [ + { + "name": "DistinctRows" + }, + { + "name": "RowCountDistinct" + }, + { + "name": "OriginalData" + }, + { + "name": "RowCountOrig" + }, + { + "name": "DistinctAllCols" + }, + { + "name": "FuzzyMatch" + }, + { + "name": "RowCountAll" + }, + { + "name": "RowCountFuzzy" + } + ], + "script": "source(output(\n\t\tmovie as string,\n\t\ttitle as string,\n\t\tgenres as string,\n\t\tyear as string,\n\t\tRating as string,\n\t\tRottenTomato as string\n\t),\n\tallowSchemaDrift: true,\n\tvalidateSchema: false,\n\tignoreNoFilesFound: false) ~> MoviesCSV\nMoviesCSV aggregate(groupBy(movie),\n\teach(match(name!='movie'), $$ = first($$))) ~> DistinctRows\nDistinctRows aggregate(rowcount_agg = count(1)) ~> RowCountDistinct\nMoviesCSV select(mapColumn(\n\t\tmovie,\n\t\ttitle,\n\t\tgenres,\n\t\tyear,\n\t\tRating,\n\t\tRottenTomato\n\t),\n\tskipDuplicateMapInputs: false,\n\tskipDuplicateMapOutputs: false) ~> OriginalData\nOriginalData aggregate(rowcount_orig = count(1)) ~> RowCountOrig\nMoviesCSV aggregate(groupBy(mycols = sha2(256,columns())),\n\teach(match(true()), $$ = first($$))) ~> DistinctAllCols\nMoviesCSV aggregate(groupBy(colsfuzzy = sha2(256,movie,year,soundex(title))),\n\teach(match(true()), $$ = first($$))) ~> FuzzyMatch\nDistinctAllCols aggregate(rowcountall = count(1)) ~> RowCountAll\nFuzzyMatch aggregate(rowcountfuzzy = count(1)) ~> RowCountFuzzy\nDistinctRows sink(allowSchemaDrift: true,\n\tvalidateSchema: false,\n\tpartitionBy('hash', 1)) ~> OutputDistinctData" + } + } +} \ No newline at end of file diff --git a/dataset/MoviesD2.json b/dataset/MoviesD2.json new file mode 100644 index 0000000..4c438b9 --- /dev/null +++ b/dataset/MoviesD2.json @@ -0,0 +1,48 @@ +{ + "name": "MoviesD2", + "properties": { + "linkedServiceName": { + "referenceName": "DAzureBlobStorage1", + "type": "LinkedServiceReference" + }, + "annotations": [], + "type": "DelimitedText", + "typeProperties": { + "location": { + "type": "AzureBlobStorageLocation", + "fileName": "moviesDB2.csv", + "container": "csv" + }, + "columnDelimiter": ",", + "escapeChar": "\\", + "firstRowAsHeader": true, + "quoteChar": "\"" + }, + "schema": [ + { + "name": "movie", + "type": "String" + }, + { + "name": "title", + "type": "String" + }, + { + "name": "genres", + "type": "String" + }, + { + "name": "year", + "type": "String" + }, + { + "name": "Rating", + "type": "String" + }, + { + "name": "RottenTomato", + "type": "String" + } + ] + } +} \ No newline at end of file diff --git a/dataset/folderout.json b/dataset/folderout.json new file mode 100644 index 0000000..2d04b00 --- /dev/null +++ b/dataset/folderout.json @@ -0,0 +1,22 @@ +{ + "name": "folderout", + "properties": { + "linkedServiceName": { + "referenceName": "DAzureBlobStorage1", + "type": "LinkedServiceReference" + }, + "annotations": [], + "type": "DelimitedText", + "typeProperties": { + "location": { + "type": "AzureBlobStorageLocation", + "container": "data" + }, + "columnDelimiter": ",", + "escapeChar": "\\", + "firstRowAsHeader": false, + "quoteChar": "\"" + }, + "schema": [] + } +} \ No newline at end of file diff --git a/linkedService/AzureDataLakeStorage1.json b/linkedService/AzureDataLakeStorage1.json new file mode 100644 index 0000000..1146a2c --- /dev/null +++ b/linkedService/AzureDataLakeStorage1.json @@ -0,0 +1,12 @@ +{ + "name": "AzureDataLakeStorage1", + "type": "Microsoft.DataFactory/factories/linkedservices", + "properties": { + "annotations": [], + "type": "AzureBlobFS", + "typeProperties": { + "url": "https://glukstoacc.dfs.core.windows.net/", + "encryptedCredential": "ew0KICAiVmVyc2lvbiI6ICIyMDE3LTExLTMwIiwNCiAgIlByb3RlY3Rpb25Nb2RlIjogIktleSIsDQogICJTZWNyZXRDb250ZW50VHlwZSI6ICJQbGFpbnRleHQiLA0KICAiQ3JlZGVudGlhbElkIjogIkRBVEFGQUNUT1JZQEY1QzY0MjM0LTRCNzktNDE0Qi1CQTA5LTdGRTk0NTBBMzU5RF9lNTg4YjNlYy0zODMzLTQ1OWMtYWRlZi05NjIxNGU2OTM4NGQiDQp9" + } + } +} \ No newline at end of file diff --git a/linkedService/DAzureBlobStorage1.json b/linkedService/DAzureBlobStorage1.json new file mode 100644 index 0000000..57d3a2f --- /dev/null +++ b/linkedService/DAzureBlobStorage1.json @@ -0,0 +1,12 @@ +{ + "name": "DAzureBlobStorage1", + "type": "Microsoft.DataFactory/factories/linkedservices", + "properties": { + "annotations": [], + "type": "AzureBlobStorage", + "typeProperties": { + "connectionString": "DefaultEndpointsProtocol=https;AccountName=glukstoacc;EndpointSuffix=core.windows.net;", + "encryptedCredential": "ew0KICAiVmVyc2lvbiI6ICIyMDE3LTExLTMwIiwNCiAgIlByb3RlY3Rpb25Nb2RlIjogIktleSIsDQogICJTZWNyZXRDb250ZW50VHlwZSI6ICJQbGFpbnRleHQiLA0KICAiQ3JlZGVudGlhbElkIjogIkRBVEFGQUNUT1JZQEY1QzY0MjM0LTRCNzktNDE0Qi1CQTA5LTdGRTk0NTBBMzU5RF9lMDA4NmI4OC03MWE2LTQ3YTYtYWRlMy00ZmVlM2VlM2ViMmIiDQp9" + } + } +} \ No newline at end of file diff --git a/pipeline/DistinctRows.json b/pipeline/DistinctRows.json new file mode 100644 index 0000000..fcc7cbb --- /dev/null +++ b/pipeline/DistinctRows.json @@ -0,0 +1,33 @@ +{ + "name": "DistinctRows", + "properties": { + "description": "Many examples of shareable data flows for distinct rows and deduping data", + "activities": [ + { + "name": "DistinctRows", + "type": "ExecuteDataFlow", + "dependsOn": [], + "policy": { + "timeout": "7.00:00:00", + "retry": 0, + "retryIntervalInSeconds": 30, + "secureOutput": false, + "secureInput": false + }, + "userProperties": [], + "typeProperties": { + "dataflow": { + "referenceName": "DistinctRowsAll", + "type": "DataFlowReference" + }, + "compute": { + "coreCount": 8, + "computeType": "General" + }, + "traceLevel": "Fine" + } + } + ], + "annotations": [] + } +} \ No newline at end of file diff --git a/pipeline/DistinctRows1.json b/pipeline/DistinctRows1.json new file mode 100644 index 0000000..4d6985d --- /dev/null +++ b/pipeline/DistinctRows1.json @@ -0,0 +1,33 @@ +{ + "name": "DistinctRows1", + "properties": { + "description": "Many examples of shareable data flows for distinct rows and deduping data", + "activities": [ + { + "name": "DistinctRows", + "type": "ExecuteDataFlow", + "dependsOn": [], + "policy": { + "timeout": "7.00:00:00", + "retry": 0, + "retryIntervalInSeconds": 30, + "secureOutput": false, + "secureInput": false + }, + "userProperties": [], + "typeProperties": { + "dataflow": { + "referenceName": "DistinctRowsAll1", + "type": "DataFlowReference" + }, + "compute": { + "coreCount": 8, + "computeType": "General" + }, + "traceLevel": "Fine" + } + } + ], + "annotations": [] + } +} \ No newline at end of file diff --git a/publish_config.json b/publish_config.json new file mode 100644 index 0000000..3c723d0 --- /dev/null +++ b/publish_config.json @@ -0,0 +1 @@ +{"publishBranch":"adf_publish"} \ No newline at end of file diff --git a/templates/Distinct Rows All/Distinct Rows All.json b/templates/Distinct Rows All/Distinct Rows All.json new file mode 100644 index 0000000..1c0d84b --- /dev/null +++ b/templates/Distinct Rows All/Distinct Rows All.json @@ -0,0 +1,193 @@ +{ + "$schema": "http://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "parameters": { + "factoryName": { + "type": "string", + "metadata": "Data Factory name" + }, + "AzureBlobStorage1": { + "type": "string" + } + }, + "variables": { + "factoryId": "[concat('Microsoft.DataFactory/factories/', parameters('factoryName'))]" + }, + "resources": [ + { + "name": "[concat(parameters('factoryName'), '/DistinctRows')]", + "type": "Microsoft.DataFactory/factories/pipelines", + "apiVersion": "2018-06-01", + "properties": { + "description": "Many examples of shareable data flows for distinct rows and deduping data", + "activities": [ + { + "name": "DistinctRows", + "type": "ExecuteDataFlow", + "dependsOn": [], + "policy": { + "timeout": "7.00:00:00", + "retry": 0, + "retryIntervalInSeconds": 30, + "secureOutput": false, + "secureInput": false + }, + "userProperties": [], + "typeProperties": { + "dataflow": { + "referenceName": "DistinctRowsAll", + "type": "DataFlowReference" + }, + "integrationRuntime": { + "referenceName": "dataflowcluster", + "type": "IntegrationRuntimeReference" + } + } + } + ], + "annotations": [] + }, + "dependsOn": [ + "[concat(variables('factoryId'), '/dataflows/DistinctRowsAll')]" + ] + }, + { + "name": "[concat(parameters('factoryName'), '/DistinctRowsAll')]", + "type": "Microsoft.DataFactory/factories/dataflows", + "apiVersion": "2018-06-01", + "properties": { + "description": "Distinct Rows using fuzzy, key, all cols", + "type": "MappingDataFlow", + "typeProperties": { + "sources": [ + { + "dataset": { + "referenceName": "MoviesD2", + "type": "DatasetReference" + }, + "name": "MoviesCSV" + } + ], + "sinks": [ + { + "dataset": { + "referenceName": "folderout", + "type": "DatasetReference" + }, + "name": "OutputDistinctData" + } + ], + "transformations": [ + { + "name": "DistinctRows" + }, + { + "name": "RowCountDistinct" + }, + { + "name": "OriginalData" + }, + { + "name": "RowCountOrig" + }, + { + "name": "DistinctAllCols" + }, + { + "name": "FuzzyMatch" + }, + { + "name": "RowCountAll" + }, + { + "name": "RowCountFuzzy" + } + ], + "script": "source(output(\n\t\tmovie as string,\n\t\ttitle as string,\n\t\tgenres as string,\n\t\tyear as string,\n\t\tRating as string,\n\t\tRottenTomato as string\n\t),\n\tallowSchemaDrift: true,\n\tvalidateSchema: false) ~> MoviesCSV\nMoviesCSV aggregate(groupBy(movie),\n\teach(match(name!='movie'), $$ = first($$))) ~> DistinctRows\nDistinctRows aggregate(rowcount_agg = count(1)) ~> RowCountDistinct\nMoviesCSV select(mapColumn(\n\t\tmovie,\n\t\ttitle,\n\t\tgenres,\n\t\tyear,\n\t\tRating,\n\t\tRottenTomato\n\t),\n\tskipDuplicateMapInputs: false,\n\tskipDuplicateMapOutputs: false) ~> OriginalData\nOriginalData aggregate(rowcount_orig = count(1)) ~> RowCountOrig\nMoviesCSV aggregate(groupBy(mycols = sha2(256,columns())),\n\teach(match(true()), $$ = first($$))) ~> DistinctAllCols\nMoviesCSV aggregate(groupBy(colsfuzzy = sha2(256,movie,year,soundex(title))),\n\teach(match(true()), $$ = first($$))) ~> FuzzyMatch\nDistinctAllCols aggregate(rowcountall = count(1)) ~> RowCountAll\nFuzzyMatch aggregate(rowcountfuzzy = count(1)) ~> RowCountFuzzy\nDistinctRows sink(allowSchemaDrift: true,\n\tvalidateSchema: false,\n\tpartitionBy('hash', 1)) ~> OutputDistinctData" + } + }, + "dependsOn": [ + "[concat(variables('factoryId'), '/datasets/MoviesD2')]", + "[concat(variables('factoryId'), '/datasets/folderout')]" + ] + }, + { + "name": "[concat(parameters('factoryName'), '/MoviesD2')]", + "type": "Microsoft.DataFactory/factories/datasets", + "apiVersion": "2018-06-01", + "properties": { + "linkedServiceName": { + "referenceName": "[parameters('AzureBlobStorage1')]", + "type": "LinkedServiceReference" + }, + "annotations": [], + "type": "DelimitedText", + "typeProperties": { + "location": { + "type": "AzureBlobStorageLocation", + "fileName": "moviesDB.csv", + "folderPath": "SampleData", + "container": "mycontainer" + }, + "columnDelimiter": ",", + "escapeChar": "\\", + "firstRowAsHeader": true, + "quoteChar": "\"" + }, + "schema": [ + { + "name": "movie", + "type": "String" + }, + { + "name": "title", + "type": "String" + }, + { + "name": "genres", + "type": "String" + }, + { + "name": "year", + "type": "String" + }, + { + "name": "Rating", + "type": "String" + }, + { + "name": "RottenTomato", + "type": "String" + } + ] + }, + "dependsOn": [] + }, + { + "name": "[concat(parameters('factoryName'), '/folderout')]", + "type": "Microsoft.DataFactory/factories/datasets", + "apiVersion": "2018-06-01", + "properties": { + "linkedServiceName": { + "referenceName": "[parameters('AzureBlobStorage1')]", + "type": "LinkedServiceReference" + }, + "annotations": [], + "type": "DelimitedText", + "typeProperties": { + "location": { + "type": "AzureBlobStorageLocation", + "folderPath": "output/demoout1", + "container": "mycontainer" + }, + "columnDelimiter": ",", + "escapeChar": "\\", + "firstRowAsHeader": false, + "quoteChar": "\"" + }, + "schema": [] + }, + "dependsOn": [] + } + ] +} \ No newline at end of file diff --git a/templates/Distinct Rows All/manifest.json b/templates/Distinct Rows All/manifest.json new file mode 100644 index 0000000..bf2d3b9 --- /dev/null +++ b/templates/Distinct Rows All/manifest.json @@ -0,0 +1,28 @@ +{ + "name": "Distinct Rows All", + "description": "Many examples of shareable data flows for distinct rows and deduping data", + "image": "Mapping Data FlowDistinctRows", + "icons": [ + "ExecuteDataFlow", + "DelimitedText", + "DelimitedText" + ], + "requires": { + "linkedservices": { + "AzureBlobStorage1": { + "supportTypes": [ + "AzureBlobStorage" + ] + } + } + }, + "documentation": "http://youtu.be/QOi26ETtPTw?hd=1", + "author": "dipak.giri@gauri.com", + "annotations": [ + "data flows" + ], + "services": [ + "Azure Data Factory" + ], + "categories": [] +} \ No newline at end of file