kromerm · dmgiri · Jul 23, 2022 · Jul 23, 2022 · Jul 23, 2022 · Jul 23, 2022
diff --git a/dataflow/DistinctRowsAll.json b/dataflow/DistinctRowsAll.json
@@ -0,0 +1,54 @@
+{
+	"name": "DistinctRowsAll",
+	"properties": {
+		"description": "Distinct rows three ways: by key column (movie), by a hash of all columns, and by fuzzy match (soundex of title)",
+		"type": "MappingDataFlow",
+		"typeProperties": {
+			"sources": [
+				{
+					"dataset": {
+						"referenceName": "MoviesD2",
+						"type": "DatasetReference"
+					},
+					"name": "MoviesCSV"
+				}
+			],
+			"sinks": [
+				{
+					"dataset": {
+						"referenceName": "folderout",
+						"type": "DatasetReference"
+					},
+					"name": "OutputDistinctData"
+				}
+			],
+			"transformations": [
+				{
+					"name": "DistinctRows"
+				},
+				{
+					"name": "RowCountDistinct"
+				},
+				{
+					"name": "OriginalData"
+				},
+				{
+					"name": "RowCountOrig"
+				},
+				{
+					"name": "DistinctAllCols"
+				},
+				{
+					"name": "FuzzyMatch"
+				},
+				{
+					"name": "RowCountAll"
+				},
+				{
+					"name": "RowCountFuzzy"
+				}
+			],
+			"script": "source(output(\n\t\tmovie as string,\n\t\ttitle as string,\n\t\tgenres as string,\n\t\tyear as string,\n\t\tRating as string,\n\t\tRottenTomato as string\n\t),\n\tallowSchemaDrift: true,\n\tvalidateSchema: false,\n\tignoreNoFilesFound: false) ~> MoviesCSV\nMoviesCSV aggregate(groupBy(movie),\n\teach(match(name!='movie'), $$ = first($$))) ~> DistinctRows\nDistinctRows aggregate(rowcount_agg = count(1)) ~> RowCountDistinct\nMoviesCSV select(mapColumn(\n\t\tmovie,\n\t\ttitle,\n\t\tgenres,\n\t\tyear,\n\t\tRating,\n\t\tRottenTomato\n\t),\n\tskipDuplicateMapInputs: false,\n\tskipDuplicateMapOutputs: false) ~> OriginalData\nOriginalData aggregate(rowcount_orig = count(1)) ~> RowCountOrig\nMoviesCSV aggregate(groupBy(mycols = sha2(256,columns())),\n\teach(match(true()), $$ = first($$))) ~> DistinctAllCols\nMoviesCSV aggregate(groupBy(colsfuzzy = sha2(256,movie,year,soundex(title))),\n\teach(match(true()), $$ = first($$))) ~> FuzzyMatch\nDistinctAllCols aggregate(rowcountall = count(1)) ~> RowCountAll\nFuzzyMatch aggregate(rowcountfuzzy = count(1)) ~> RowCountFuzzy\nDistinctRows sink(allowSchemaDrift: true,\n\tvalidateSchema: false,\n\tpartitionBy('hash', 1)) ~> OutputDistinctData"
+		}
+	}
+}
diff --git a/dataflow/DistinctRowsAll1.json b/dataflow/DistinctRowsAll1.json
@@ -0,0 +1,54 @@
+{
+	"name": "DistinctRowsAll1",
+	"properties": {
+		"description": "Distinct rows three ways: by key column (movie), by a hash of all columns, and by fuzzy match (soundex of title)",
+		"type": "MappingDataFlow",
+		"typeProperties": {
+			"sources": [
+				{
+					"dataset": {
+						"referenceName": "MoviesD2",
+						"type": "DatasetReference"
+					},
+					"name": "MoviesCSV"
+				}
+			],
+			"sinks": [
+				{
+					"dataset": {
+						"referenceName": "folderout",
+						"type": "DatasetReference"
+					},
+					"name": "OutputDistinctData"
+				}
+			],
+			"transformations": [
+				{
+					"name": "DistinctRows"
+				},
+				{
+					"name": "RowCountDistinct"
+				},
+				{
+					"name": "OriginalData"
+				},
+				{
+					"name": "RowCountOrig"
+				},
+				{
+					"name": "DistinctAllCols"
+				},
+				{
+					"name": "FuzzyMatch"
+				},
+				{
+					"name": "RowCountAll"
+				},
+				{
+					"name": "RowCountFuzzy"
+				}
+			],
+			"script": "source(output(\n\t\tmovie as string,\n\t\ttitle as string,\n\t\tgenres as string,\n\t\tyear as string,\n\t\tRating as string,\n\t\tRottenTomato as string\n\t),\n\tallowSchemaDrift: true,\n\tvalidateSchema: false,\n\tignoreNoFilesFound: false) ~> MoviesCSV\nMoviesCSV aggregate(groupBy(movie),\n\teach(match(name!='movie'), $$ = first($$))) ~> DistinctRows\nDistinctRows aggregate(rowcount_agg = count(1)) ~> RowCountDistinct\nMoviesCSV select(mapColumn(\n\t\tmovie,\n\t\ttitle,\n\t\tgenres,\n\t\tyear,\n\t\tRating,\n\t\tRottenTomato\n\t),\n\tskipDuplicateMapInputs: false,\n\tskipDuplicateMapOutputs: false) ~> OriginalData\nOriginalData aggregate(rowcount_orig = count(1)) ~> RowCountOrig\nMoviesCSV aggregate(groupBy(mycols = sha2(256,columns())),\n\teach(match(true()), $$ = first($$))) ~> DistinctAllCols\nMoviesCSV aggregate(groupBy(colsfuzzy = sha2(256,movie,year,soundex(title))),\n\teach(match(true()), $$ = first($$))) ~> FuzzyMatch\nDistinctAllCols aggregate(rowcountall = count(1)) ~> RowCountAll\nFuzzyMatch aggregate(rowcountfuzzy = count(1)) ~> RowCountFuzzy\nDistinctRows sink(allowSchemaDrift: true,\n\tvalidateSchema: false,\n\tpartitionBy('hash', 1)) ~> OutputDistinctData"
+		}
+	}
+}
diff --git a/dataflow/DistinctRowsAll_copy1.json b/dataflow/DistinctRowsAll_copy1.json
@@ -0,0 +1,54 @@
+{
+	"name": "DistinctRowsAll_copy1",
+	"properties": {
+		"description": "Distinct rows three ways: by key column (movie), by a hash of all columns, and by fuzzy match (soundex of title)",
+		"type": "MappingDataFlow",
+		"typeProperties": {
+			"sources": [
+				{
+					"dataset": {
+						"referenceName": "MoviesD2",
+						"type": "DatasetReference"
+					},
+					"name": "MoviesCSV"
+				}
+			],
+			"sinks": [
+				{
+					"dataset": {
+						"referenceName": "folderout",
+						"type": "DatasetReference"
+					},
+					"name": "OutputDistinctData"
+				}
+			],
+			"transformations": [
+				{
+					"name": "DistinctRows"
+				},
+				{
+					"name": "RowCountDistinct"
+				},
+				{
+					"name": "OriginalData"
+				},
+				{
+					"name": "RowCountOrig"
+				},
+				{
+					"name": "DistinctAllCols"
+				},
+				{
+					"name": "FuzzyMatch"
+				},
+				{
+					"name": "RowCountAll"
+				},
+				{
+					"name": "RowCountFuzzy"
+				}
+			],
+			"script": "source(output(\n\t\tmovie as string,\n\t\ttitle as string,\n\t\tgenres as string,\n\t\tyear as string,\n\t\tRating as string,\n\t\tRottenTomato as string\n\t),\n\tallowSchemaDrift: true,\n\tvalidateSchema: false,\n\tignoreNoFilesFound: false) ~> MoviesCSV\nMoviesCSV aggregate(groupBy(movie),\n\teach(match(name!='movie'), $$ = first($$))) ~> DistinctRows\nDistinctRows aggregate(rowcount_agg = count(1)) ~> RowCountDistinct\nMoviesCSV select(mapColumn(\n\t\tmovie,\n\t\ttitle,\n\t\tgenres,\n\t\tyear,\n\t\tRating,\n\t\tRottenTomato\n\t),\n\tskipDuplicateMapInputs: false,\n\tskipDuplicateMapOutputs: false) ~> OriginalData\nOriginalData aggregate(rowcount_orig = count(1)) ~> RowCountOrig\nMoviesCSV aggregate(groupBy(mycols = sha2(256,columns())),\n\teach(match(true()), $$ = first($$))) ~> DistinctAllCols\nMoviesCSV aggregate(groupBy(colsfuzzy = sha2(256,movie,year,soundex(title))),\n\teach(match(true()), $$ = first($$))) ~> FuzzyMatch\nDistinctAllCols aggregate(rowcountall = count(1)) ~> RowCountAll\nFuzzyMatch aggregate(rowcountfuzzy = count(1)) ~> RowCountFuzzy\nDistinctRows sink(allowSchemaDrift: true,\n\tvalidateSchema: false,\n\tpartitionBy('hash', 1)) ~> OutputDistinctData"
+		}
+	}
+}
diff --git a/dataset/MoviesD2.json b/dataset/MoviesD2.json
@@ -0,0 +1,48 @@
+{
+	"name": "MoviesD2",
+	"properties": {
+		"linkedServiceName": {
+			"referenceName": "DAzureBlobStorage1",
+			"type": "LinkedServiceReference"
+		},
+		"annotations": [],
+		"type": "DelimitedText",
+		"typeProperties": {
+			"location": {
+				"type": "AzureBlobStorageLocation",
+				"fileName": "moviesDB2.csv",
+				"container": "csv"
+			},
+			"columnDelimiter": ",",
+			"escapeChar": "\\",
+			"firstRowAsHeader": true,
+			"quoteChar": "\""
+		},
+		"schema": [
+			{
+				"name": "movie",
+				"type": "String"
+			},
+			{
+				"name": "title",
+				"type": "String"
+			},
+			{
+				"name": "genres",
+				"type": "String"
+			},
+			{
+				"name": "year",
+				"type": "String"
+			},
+			{
+				"name": "Rating",
+				"type": "String"
+			},
+			{
+				"name": "RottenTomato",
+				"type": "String"
+			}
+		]
+	}
+}
diff --git a/dataset/folderout.json b/dataset/folderout.json
@@ -0,0 +1,22 @@
+{
+	"name": "folderout",
+	"properties": {
+		"linkedServiceName": {
+			"referenceName": "DAzureBlobStorage1",
+			"type": "LinkedServiceReference"
+		},
+		"annotations": [],
+		"type": "DelimitedText",
+		"typeProperties": {
+			"location": {
+				"type": "AzureBlobStorageLocation",
+				"container": "data"
+			},
+			"columnDelimiter": ",",
+			"escapeChar": "\\",
+			"firstRowAsHeader": false,
+			"quoteChar": "\""
+		},
+		"schema": []
+	}
+}
diff --git a/linkedService/AzureDataLakeStorage1.json b/linkedService/AzureDataLakeStorage1.json
@@ -0,0 +1,12 @@
+{
+	"name": "AzureDataLakeStorage1",
+	"type": "Microsoft.DataFactory/factories/linkedservices",
+	"properties": {
+		"annotations": [],
+		"type": "AzureBlobFS",
+		"typeProperties": {
+			"url": "https://glukstoacc.dfs.core.windows.net/",
+			"encryptedCredential": "ew0KICAiVmVyc2lvbiI6ICIyMDE3LTExLTMwIiwNCiAgIlByb3RlY3Rpb25Nb2RlIjogIktleSIsDQogICJTZWNyZXRDb250ZW50VHlwZSI6ICJQbGFpbnRleHQiLA0KICAiQ3JlZGVudGlhbElkIjogIkRBVEFGQUNUT1JZQEY1QzY0MjM0LTRCNzktNDE0Qi1CQTA5LTdGRTk0NTBBMzU5RF9lNTg4YjNlYy0zODMzLTQ1OWMtYWRlZi05NjIxNGU2OTM4NGQiDQp9"
+		}
+	}
+}
diff --git a/linkedService/DAzureBlobStorage1.json b/linkedService/DAzureBlobStorage1.json
@@ -0,0 +1,12 @@
+{
+	"name": "DAzureBlobStorage1",
+	"type": "Microsoft.DataFactory/factories/linkedservices",
+	"properties": {
+		"annotations": [],
+		"type": "AzureBlobStorage",
+		"typeProperties": {
+			"connectionString": "DefaultEndpointsProtocol=https;AccountName=glukstoacc;EndpointSuffix=core.windows.net;",
+			"encryptedCredential": "ew0KICAiVmVyc2lvbiI6ICIyMDE3LTExLTMwIiwNCiAgIlByb3RlY3Rpb25Nb2RlIjogIktleSIsDQogICJTZWNyZXRDb250ZW50VHlwZSI6ICJQbGFpbnRleHQiLA0KICAiQ3JlZGVudGlhbElkIjogIkRBVEFGQUNUT1JZQEY1QzY0MjM0LTRCNzktNDE0Qi1CQTA5LTdGRTk0NTBBMzU5RF9lMDA4NmI4OC03MWE2LTQ3YTYtYWRlMy00ZmVlM2VlM2ViMmIiDQp9"
+		}
+	}
+}
diff --git a/pipeline/DistinctRows.json b/pipeline/DistinctRows.json
@@ -0,0 +1,33 @@
+{
+	"name": "DistinctRows",
+	"properties": {
+		"description": "Runs the DistinctRowsAll data flow, a shareable example of several ways to select distinct rows and deduplicate data",
+		"activities": [
+			{
+				"name": "DistinctRows",
+				"type": "ExecuteDataFlow",
+				"dependsOn": [],
+				"policy": {
+					"timeout": "7.00:00:00",
+					"retry": 0,
+					"retryIntervalInSeconds": 30,
+					"secureOutput": false,
+					"secureInput": false
+				},
+				"userProperties": [],
+				"typeProperties": {
+					"dataflow": {
+						"referenceName": "DistinctRowsAll",
+						"type": "DataFlowReference"
+					},
+					"compute": {
+						"coreCount": 8,
+						"computeType": "General"
+					},
+					"traceLevel": "Fine"
+				}
+			}
+		],
+		"annotations": []
+	}
+}
diff --git a/pipeline/DistinctRows1.json b/pipeline/DistinctRows1.json
@@ -0,0 +1,33 @@
+{
+	"name": "DistinctRows1",
+	"properties": {
+		"description": "Runs the DistinctRowsAll1 data flow, a shareable example of several ways to select distinct rows and deduplicate data",
+		"activities": [
+			{
+				"name": "DistinctRows",
+				"type": "ExecuteDataFlow",
+				"dependsOn": [],
+				"policy": {
+					"timeout": "7.00:00:00",
+					"retry": 0,
+					"retryIntervalInSeconds": 30,
+					"secureOutput": false,
+					"secureInput": false
+				},
+				"userProperties": [],
+				"typeProperties": {
+					"dataflow": {
+						"referenceName": "DistinctRowsAll1",
+						"type": "DataFlowReference"
+					},
+					"compute": {
+						"coreCount": 8,
+						"computeType": "General"
+					},
+					"traceLevel": "Fine"
+				}
+			}
+		],
+		"annotations": []
+	}
+}
diff --git a/publish_config.json b/publish_config.json
@@ -0,0 +1 @@
+{"publishBranch":"adf_publish"}