Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 54 additions & 0 deletions dataflow/DistinctRowsAll.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
{
"name": "DistinctRowsAll",
"properties": {
"description": "Distinct rows using fuzzy matching, a key column, or all columns",
"type": "MappingDataFlow",
"typeProperties": {
"sources": [
{
"dataset": {
"referenceName": "MoviesD2",
"type": "DatasetReference"
},
"name": "MoviesCSV"
}
],
"sinks": [
{
"dataset": {
"referenceName": "folderout",
"type": "DatasetReference"
},
"name": "OutputDistinctData"
}
],
"transformations": [
{
"name": "DistinctRows"
},
{
"name": "RowCountDistinct"
},
{
"name": "OriginalData"
},
{
"name": "RowCountOrig"
},
{
"name": "DistinctAllCols"
},
{
"name": "FuzzyMatch"
},
{
"name": "RowCountAll"
},
{
"name": "RowCountFuzzy"
}
],
"script": "source(output(\n\t\tmovie as string,\n\t\ttitle as string,\n\t\tgenres as string,\n\t\tyear as string,\n\t\tRating as string,\n\t\tRottenTomato as string\n\t),\n\tallowSchemaDrift: true,\n\tvalidateSchema: false,\n\tignoreNoFilesFound: false) ~> MoviesCSV\nMoviesCSV aggregate(groupBy(movie),\n\teach(match(name!='movie'), $$ = first($$))) ~> DistinctRows\nDistinctRows aggregate(rowcount_agg = count(1)) ~> RowCountDistinct\nMoviesCSV select(mapColumn(\n\t\tmovie,\n\t\ttitle,\n\t\tgenres,\n\t\tyear,\n\t\tRating,\n\t\tRottenTomato\n\t),\n\tskipDuplicateMapInputs: false,\n\tskipDuplicateMapOutputs: false) ~> OriginalData\nOriginalData aggregate(rowcount_orig = count(1)) ~> RowCountOrig\nMoviesCSV aggregate(groupBy(mycols = sha2(256,columns())),\n\teach(match(true()), $$ = first($$))) ~> DistinctAllCols\nMoviesCSV aggregate(groupBy(colsfuzzy = sha2(256,movie,year,soundex(title))),\n\teach(match(true()), $$ = first($$))) ~> FuzzyMatch\nDistinctAllCols aggregate(rowcountall = count(1)) ~> RowCountAll\nFuzzyMatch aggregate(rowcountfuzzy = count(1)) ~> RowCountFuzzy\nDistinctRows sink(allowSchemaDrift: true,\n\tvalidateSchema: false,\n\tpartitionBy('hash', 1)) ~> OutputDistinctData"
}
}
}
54 changes: 54 additions & 0 deletions dataflow/DistinctRowsAll1.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
{
"name": "DistinctRowsAll1",
"properties": {
"description": "Distinct rows using fuzzy matching, a key column, or all columns",
"type": "MappingDataFlow",
"typeProperties": {
"sources": [
{
"dataset": {
"referenceName": "MoviesD2",
"type": "DatasetReference"
},
"name": "MoviesCSV"
}
],
"sinks": [
{
"dataset": {
"referenceName": "folderout",
"type": "DatasetReference"
},
"name": "OutputDistinctData"
}
],
"transformations": [
{
"name": "DistinctRows"
},
{
"name": "RowCountDistinct"
},
{
"name": "OriginalData"
},
{
"name": "RowCountOrig"
},
{
"name": "DistinctAllCols"
},
{
"name": "FuzzyMatch"
},
{
"name": "RowCountAll"
},
{
"name": "RowCountFuzzy"
}
],
"script": "source(output(\n\t\tmovie as string,\n\t\ttitle as string,\n\t\tgenres as string,\n\t\tyear as string,\n\t\tRating as string,\n\t\tRottenTomato as string\n\t),\n\tallowSchemaDrift: true,\n\tvalidateSchema: false,\n\tignoreNoFilesFound: false) ~> MoviesCSV\nMoviesCSV aggregate(groupBy(movie),\n\teach(match(name!='movie'), $$ = first($$))) ~> DistinctRows\nDistinctRows aggregate(rowcount_agg = count(1)) ~> RowCountDistinct\nMoviesCSV select(mapColumn(\n\t\tmovie,\n\t\ttitle,\n\t\tgenres,\n\t\tyear,\n\t\tRating,\n\t\tRottenTomato\n\t),\n\tskipDuplicateMapInputs: false,\n\tskipDuplicateMapOutputs: false) ~> OriginalData\nOriginalData aggregate(rowcount_orig = count(1)) ~> RowCountOrig\nMoviesCSV aggregate(groupBy(mycols = sha2(256,columns())),\n\teach(match(true()), $$ = first($$))) ~> DistinctAllCols\nMoviesCSV aggregate(groupBy(colsfuzzy = sha2(256,movie,year,soundex(title))),\n\teach(match(true()), $$ = first($$))) ~> FuzzyMatch\nDistinctAllCols aggregate(rowcountall = count(1)) ~> RowCountAll\nFuzzyMatch aggregate(rowcountfuzzy = count(1)) ~> RowCountFuzzy\nDistinctRows sink(allowSchemaDrift: true,\n\tvalidateSchema: false,\n\tpartitionBy('hash', 1)) ~> OutputDistinctData"
}
}
}
54 changes: 54 additions & 0 deletions dataflow/DistinctRowsAll_copy1.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
{
"name": "DistinctRowsAll_copy1",
"properties": {
"description": "Distinct rows using fuzzy matching, a key column, or all columns",
"type": "MappingDataFlow",
"typeProperties": {
"sources": [
{
"dataset": {
"referenceName": "MoviesD2",
"type": "DatasetReference"
},
"name": "MoviesCSV"
}
],
"sinks": [
{
"dataset": {
"referenceName": "folderout",
"type": "DatasetReference"
},
"name": "OutputDistinctData"
}
],
"transformations": [
{
"name": "DistinctRows"
},
{
"name": "RowCountDistinct"
},
{
"name": "OriginalData"
},
{
"name": "RowCountOrig"
},
{
"name": "DistinctAllCols"
},
{
"name": "FuzzyMatch"
},
{
"name": "RowCountAll"
},
{
"name": "RowCountFuzzy"
}
],
"script": "source(output(\n\t\tmovie as string,\n\t\ttitle as string,\n\t\tgenres as string,\n\t\tyear as string,\n\t\tRating as string,\n\t\tRottenTomato as string\n\t),\n\tallowSchemaDrift: true,\n\tvalidateSchema: false,\n\tignoreNoFilesFound: false) ~> MoviesCSV\nMoviesCSV aggregate(groupBy(movie),\n\teach(match(name!='movie'), $$ = first($$))) ~> DistinctRows\nDistinctRows aggregate(rowcount_agg = count(1)) ~> RowCountDistinct\nMoviesCSV select(mapColumn(\n\t\tmovie,\n\t\ttitle,\n\t\tgenres,\n\t\tyear,\n\t\tRating,\n\t\tRottenTomato\n\t),\n\tskipDuplicateMapInputs: false,\n\tskipDuplicateMapOutputs: false) ~> OriginalData\nOriginalData aggregate(rowcount_orig = count(1)) ~> RowCountOrig\nMoviesCSV aggregate(groupBy(mycols = sha2(256,columns())),\n\teach(match(true()), $$ = first($$))) ~> DistinctAllCols\nMoviesCSV aggregate(groupBy(colsfuzzy = sha2(256,movie,year,soundex(title))),\n\teach(match(true()), $$ = first($$))) ~> FuzzyMatch\nDistinctAllCols aggregate(rowcountall = count(1)) ~> RowCountAll\nFuzzyMatch aggregate(rowcountfuzzy = count(1)) ~> RowCountFuzzy\nDistinctRows sink(allowSchemaDrift: true,\n\tvalidateSchema: false,\n\tpartitionBy('hash', 1)) ~> OutputDistinctData"
}
}
}
48 changes: 48 additions & 0 deletions dataset/MoviesD2.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
{
"name": "MoviesD2",
"properties": {
"linkedServiceName": {
"referenceName": "DAzureBlobStorage1",
"type": "LinkedServiceReference"
},
"annotations": [],
"type": "DelimitedText",
"typeProperties": {
"location": {
"type": "AzureBlobStorageLocation",
"fileName": "moviesDB2.csv",
"container": "csv"
},
"columnDelimiter": ",",
"escapeChar": "\\",
"firstRowAsHeader": true,
"quoteChar": "\""
},
"schema": [
{
"name": "movie",
"type": "String"
},
{
"name": "title",
"type": "String"
},
{
"name": "genres",
"type": "String"
},
{
"name": "year",
"type": "String"
},
{
"name": "Rating",
"type": "String"
},
{
"name": "RottenTomato",
"type": "String"
}
]
}
}
22 changes: 22 additions & 0 deletions dataset/folderout.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
{
"name": "folderout",
"properties": {
"linkedServiceName": {
"referenceName": "DAzureBlobStorage1",
"type": "LinkedServiceReference"
},
"annotations": [],
"type": "DelimitedText",
"typeProperties": {
"location": {
"type": "AzureBlobStorageLocation",
"container": "data"
},
"columnDelimiter": ",",
"escapeChar": "\\",
"firstRowAsHeader": false,
"quoteChar": "\""
},
"schema": []
}
}
12 changes: 12 additions & 0 deletions linkedService/AzureDataLakeStorage1.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
{
"name": "AzureDataLakeStorage1",
"type": "Microsoft.DataFactory/factories/linkedservices",
"properties": {
"annotations": [],
"type": "AzureBlobFS",
"typeProperties": {
"url": "https://glukstoacc.dfs.core.windows.net/",
"encryptedCredential": "ew0KICAiVmVyc2lvbiI6ICIyMDE3LTExLTMwIiwNCiAgIlByb3RlY3Rpb25Nb2RlIjogIktleSIsDQogICJTZWNyZXRDb250ZW50VHlwZSI6ICJQbGFpbnRleHQiLA0KICAiQ3JlZGVudGlhbElkIjogIkRBVEFGQUNUT1JZQEY1QzY0MjM0LTRCNzktNDE0Qi1CQTA5LTdGRTk0NTBBMzU5RF9lNTg4YjNlYy0zODMzLTQ1OWMtYWRlZi05NjIxNGU2OTM4NGQiDQp9"
}
}
}
12 changes: 12 additions & 0 deletions linkedService/DAzureBlobStorage1.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
{
"name": "DAzureBlobStorage1",
"type": "Microsoft.DataFactory/factories/linkedservices",
"properties": {
"annotations": [],
"type": "AzureBlobStorage",
"typeProperties": {
"connectionString": "DefaultEndpointsProtocol=https;AccountName=glukstoacc;EndpointSuffix=core.windows.net;",
"encryptedCredential": "ew0KICAiVmVyc2lvbiI6ICIyMDE3LTExLTMwIiwNCiAgIlByb3RlY3Rpb25Nb2RlIjogIktleSIsDQogICJTZWNyZXRDb250ZW50VHlwZSI6ICJQbGFpbnRleHQiLA0KICAiQ3JlZGVudGlhbElkIjogIkRBVEFGQUNUT1JZQEY1QzY0MjM0LTRCNzktNDE0Qi1CQTA5LTdGRTk0NTBBMzU5RF9lMDA4NmI4OC03MWE2LTQ3YTYtYWRlMy00ZmVlM2VlM2ViMmIiDQp9"
}
}
}
33 changes: 33 additions & 0 deletions pipeline/DistinctRows.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
{
"name": "DistinctRows",
"properties": {
"description": "Many examples of shareable data flows for distinct rows and deduping data",
"activities": [
{
"name": "DistinctRows",
"type": "ExecuteDataFlow",
"dependsOn": [],
"policy": {
"timeout": "7.00:00:00",
"retry": 0,
"retryIntervalInSeconds": 30,
"secureOutput": false,
"secureInput": false
},
"userProperties": [],
"typeProperties": {
"dataflow": {
"referenceName": "DistinctRowsAll",
"type": "DataFlowReference"
},
"compute": {
"coreCount": 8,
"computeType": "General"
},
"traceLevel": "Fine"
}
}
],
"annotations": []
}
}
33 changes: 33 additions & 0 deletions pipeline/DistinctRows1.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
{
"name": "DistinctRows1",
"properties": {
"description": "Many examples of shareable data flows for distinct rows and deduping data",
"activities": [
{
"name": "DistinctRows",
"type": "ExecuteDataFlow",
"dependsOn": [],
"policy": {
"timeout": "7.00:00:00",
"retry": 0,
"retryIntervalInSeconds": 30,
"secureOutput": false,
"secureInput": false
},
"userProperties": [],
"typeProperties": {
"dataflow": {
"referenceName": "DistinctRowsAll1",
"type": "DataFlowReference"
},
"compute": {
"coreCount": 8,
"computeType": "General"
},
"traceLevel": "Fine"
}
}
],
"annotations": []
}
}
1 change: 1 addition & 0 deletions publish_config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"publishBranch":"adf_publish"}
Loading