Skip to content

Commit 29688fb

Browse files
committed
New documentation and code tidy ups
1 parent 74a67ed commit 29688fb

File tree

6 files changed

+279
-32
lines changed

6 files changed

+279
-32
lines changed

Azure.DataPipelineTools.sln

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,14 +9,15 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "DataPipelineTools", "DataPi
99
EndProject
1010
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{2B3AB844-E7ED-4808-A57F-3315C4EE8E26}"
1111
ProjectSection(SolutionItems) = preProject
12+
example.runsettings = example.runsettings
1213
LICENSE = LICENSE
1314
EndProjectSection
1415
EndProject
1516
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "DataPipelineTools.Tests", "DataPipelineTools.Tests\DataPipelineTools.Tests.csproj", "{A2C01394-16F9-4783-9629-2857C400D6E6}"
1617
EndProject
1718
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "DataPipelineTools.Functions.Tests", "DataPipelineTools.Functions.Tests\DataPipelineTools.Functions.Tests.csproj", "{C2448E27-7F4C-4E1E-BAF7-D5344F330073}"
1819
EndProject
19-
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "DataPipelineTools.Tests.Common", "DataPipelineTools.Tests.Common\DataPipelineTools.Tests.Common.csproj", "{6C304777-0D1A-45A6-A5E0-5849022F319A}"
20+
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "DataPipelineTools.Tests.Common", "DataPipelineTools.Tests.Common\DataPipelineTools.Tests.Common.csproj", "{6C304777-0D1A-45A6-A5E0-5849022F319A}"
2021
EndProject
2122
Global
2223
GlobalSection(SolutionConfigurationPlatforms) = preSolution

DataPipelineTools.Functions/Common/FunctionsBase.cs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,11 +14,14 @@ public FunctionsBase(ILogger<FunctionsBase> logger)
1414
_logger = logger;
1515
}
1616

17-
protected JObject GetTemplateResponse(IDataLakeConnectionConfig dataLakeConnectionConfig, object parameters)
17+
protected JObject GetTemplateResponse(IDataLakeConnectionConfig dataLakeConnectionConfig, object parameters, Microsoft.Azure.WebJobs.ExecutionContext context)
1818
{
1919
var assemblyInfo = AssemblyHelpers.GetAssemblyVersionInfoJson();
2020

2121
var responseJson = new JObject();
22+
23+
responseJson.Add("invocationId", context.InvocationId);
24+
2225
if (assemblyInfo.HasValues)
2326
responseJson.Add("debugInfo", assemblyInfo);
2427

DataPipelineTools.Functions/DataLake/DataLakeConfigFactory.cs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ public class DataLakeConfigFactory
2424
public const string OrderByColumnParam = "orderBy";
2525
public const string OrderByDescendingParam = "orderByDesc";
2626
public const string LimitParam = "limit";
27+
public const string FilterParam = "filter";
2728

2829
private readonly ILogger _logger;
2930
public DataLakeConfigFactory(ILogger<DataLakeConfigFactory> logger)
@@ -224,9 +225,8 @@ public DataLakeGetItemsConfig GetItemsConfig (HttpRequest req)
224225
private IEnumerable<Filter<DataLakeItem>> ParseFilters(HttpRequest req)
225226
{
226227
var filters = req.Query.Keys
227-
.Where(k => k.StartsWith("filter[") && k.EndsWith("]"))
228+
.Where(k => k.StartsWith($"{FilterParam}[") && k.EndsWith("]"))
228229
// Clean up the column name by removing the filter[...] parts
229-
//.Select(f => f[7..^1])
230230
.SelectMany(k => req.Query[k].Select(v => FilterFactory<DataLakeItem>.Create(k[7..^1], v, _logger)))
231231
.Where(f => f != null);
232232

DataPipelineTools.Functions/DataLake/DataLakeFunctions.cs

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ public DataLakeFunctions(ILogger<DataLakeFunctions> logger, DataLakeConfigFactor
2626

2727
[FunctionName("DataLake-GetItems")]
2828
public async Task<IActionResult> DataLakeGetItems(
29-
[HttpTrigger(AuthorizationLevel.Function, "get" /*, "post"*/, Route = "DataLake/GetItems")] HttpRequest req)
29+
[HttpTrigger(AuthorizationLevel.Function, "get" /*, "post"*/, Route = "DataLake/GetItems")] HttpRequest req, ExecutionContext context)
3030
{
3131
req.GetQueryParameterDictionary();
3232

@@ -40,7 +40,7 @@ public async Task<IActionResult> DataLakeGetItems(
4040
var client = _clientFactory.GetDataLakeClient(dataLakeConfig);
4141
var controller = _serviceFactory.CreateDataLakeService(client);
4242

43-
var responseJson = GetTemplateResponse(dataLakeConfig, getItemsConfig);
43+
var responseJson = GetTemplateResponse(dataLakeConfig, getItemsConfig, context);
4444
var items = await controller.GetItemsAsync(dataLakeConfig, getItemsConfig);
4545
foreach (var item in items)
4646
responseJson.Add(item.Key, item.Value);
@@ -50,21 +50,20 @@ public async Task<IActionResult> DataLakeGetItems(
5050
catch (ArgumentException ex)
5151
{
5252
_logger.LogError(ex, ex.Message);
53-
return new BadRequestObjectResult($"{{ \"error\": \"{ex.Message}\" }}");
53+
return new BadRequestObjectResult($"{{\n \"invocationId\":\"{context.InvocationId}\",\n \"error\": \"{ex.Message}\"\n}}");
5454
}
5555
catch (Exception ex)
5656
{
5757
_logger.LogError(ex, ex.Message); // The simple message goes in the trace, but the full exception details are in the exception logging in Application Insights
58-
59-
return new BadRequestObjectResult("{ \"error\": \"An error occurred, see the Azure Function logs for more details\" }");
58+
return new BadRequestObjectResult($"{{\n \"invocationId\":\"{context.InvocationId}\",\n \"error\": \"An error occurred, see the Azure Function logs for more details\"\n}}");
6059
}
6160
}
6261

6362

6463

6564
[FunctionName("DataLake-CheckPathCase")]
6665
public async Task<IActionResult> DataLakeCheckPathCase(
67-
[HttpTrigger(AuthorizationLevel.Function, "get", Route = "DataLake/CheckPathCase")] HttpRequest req)
66+
[HttpTrigger(AuthorizationLevel.Function, "get", Route = "DataLake/CheckPathCase")] HttpRequest req, ExecutionContext context)
6867
{
6968
var userAgentKey = req.Headers.Keys.FirstOrDefault(k => k.ToLower() == "user-agent" || k.ToLower() == "useragent");
7069
_logger.LogInformation($"C# HTTP trigger function processed a request [User Agent: { (userAgentKey == null ? "Unknown" : req.Headers[userAgentKey].ToString()) }].");
@@ -89,7 +88,7 @@ public async Task<IActionResult> DataLakeCheckPathCase(
8988
// If the path could not be found as a directory, try for a file...
9089
validatedPath ??= await dataLakeService.CheckPathAsync(getItemsConfig.Path, false);
9190

92-
var responseJson = GetTemplateResponse(dataLakeConfig, getItemsConfig);
91+
var responseJson = GetTemplateResponse(dataLakeConfig, getItemsConfig, context);
9392
responseJson.Add("validatedPath", validatedPath);
9493

9594
return validatedPath != null ?
@@ -99,12 +98,12 @@ public async Task<IActionResult> DataLakeCheckPathCase(
9998
catch (ArgumentException ex)
10099
{
101100
_logger.LogError(ex.Message);
102-
return new BadRequestObjectResult($"{{ \"error\": \"{ex.Message}\" }}");
101+
return new BadRequestObjectResult($"{{\n \"invocationId\":\"{context.InvocationId}\",\n \"error\": \"{ex.Message}\"\n}}");
103102
}
104103
catch (Exception ex)
105104
{
106105
_logger.LogError(ex, ex.Message); // The simple message goes in the trace, but the full exception details are in the exception logging in Application Insights
107-
return new BadRequestObjectResult("{ \"error\": \"An error occurred, see the Azure Function logs for more details\" }");
106+
return new BadRequestObjectResult($"{{\n \"invocationId\":\"{context.InvocationId}\",\n \"error\": \"An error occurred, see the Azure Function logs for more details\"\n}}");
108107
}
109108
}
110109

Docs/DataLake.md

Lines changed: 227 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,227 @@
1+
# Data Lake Functions
2+
3+
These functions are used to help interact with Azure Data Lake in a more seamless manner.
4+
5+
## Generic Parameters
6+
The following parameters are used by all data lake functions.
7+
| Parameter | Type | Description |
8+
| ----------- | ----------- | ----------- |
9+
| account | String | The name of the Azure Storage Account. |
10+
| container | String | The name of the Azure Data Lake Container. |
11+
| path | String | The path to the file/folder within the data lake. For the root path use '/'. |
12+
13+
---
14+
15+
## Authentication Parameters
16+
The data lake functions support authenticating to the data lake using the following options.
17+
- The Service Principal (SPN) of the deployed Azure Functions App
18+
- A user specified Service Principal. The service principal must be in the same tenant as the Azure Functions App.
19+
- A Shared Access Signature (SAS) token.
20+
- The Azure Storage Account key
21+
22+
You must only specify the parameters required for one of these authentication types for each function call. Specifying multiple types will return an error. The types are in the table below.
23+
24+
>Note: You can optionally provide the parameter `keyVault`. This allows the name of the secret in a key vault to be passed instead of a secret value. For this to work, the deployed Azure Functions App service principal must be granted access to read the secrets from the specified key vault.
25+
26+
27+
| Auth Type | Parameter | Required | Format | Description |
28+
| - | - | - | - | - |
29+
| Azure Functions SPN | N/A | | | To use the Azure Functions App Service Principal you do not need to provide any authentication parameters. |
30+
| User SPN | spnClientId | Yes | GUID | The client id of the SPN you want to use to authenticate calls to the data lake. Must be in the same tenant as the Azure Functions application. |
31+
| | spnClientSecret | Yes | String | The client secret for the SPN. This is either the secret value itself or, if the `keyVault` parameter is specified, the name of the secret in that Azure Key Vault. |
32+
| | keyVault | No | String | The name of the Azure Key Vault that contains the secret specified by the parameter `spnClientSecret`. |
33+
| SAS Token | sasToken | Yes | String | The SAS token for accessing the storage account. Must be in the same tenant as the Azure Functions application. |
34+
| | keyVault | No | String | The name of the Azure Key Vault that contains the secret specified by the parameter `sasToken`. |
35+
| Storage Account key | accountKey | Yes | String | The storage account key. This is either the key value itself or, if the `keyVault` parameter is specified, the name of the secret in that Azure Key Vault. |
36+
| | keyVault | No | String | The name of the Azure Key Vault that contains the secret specified by the parameter `accountKey`. |
37+
38+
### Responses
39+
Successful calls will return 200 (OK), failed calls will return 400 (Bad Request). In both cases the return body will be a JSON object with the result set or error details.
40+
41+
---
42+
43+
## CheckPathCase
44+
`https://<YourAzureFunctionsApp>.azurewebsites.net/api/DataLake/CheckPathCase`
45+
46+
When dealing with metadata driven processing, it is easy for a mistake in the path case in metadata to cause errors when accessing the lake, because Azure Data Lake paths are case sensitive. This function can be used to validate a path. If the path does not exist, the function checks for all paths that could match but with different casing. If one is found, that is returned, or if none/multiple matches are found an error is returned.
47+
48+
### Parameters
49+
The [generic](#generic-parameters) and [authentication](#authentication-parameters) parameters above are mandatory, and are the only parameters required.
50+
51+
### Return Values
52+
Returned values are JSON objects. Below is an example of a successful call:
53+
```
54+
{
55+
"invocationId": "37c34c08-bb41-4176-b542-8adc3617f28f",
56+
"debugInfo": {
57+
"informationalVersion": "1.0.0"
58+
},
59+
"storageContainerUrl": "https://<YourAzureFunctionsApp>.dfs.core.windows.net/myContainer",
60+
"authType": "FunctionsServicePrincipal",
61+
"parameters": {
62+
"Path": "TESTDATA"
63+
},
64+
"validatedPath": "TestData"
65+
}
66+
```
67+
68+
An example of a call with a mandatory parameter missing:
69+
```
70+
{
71+
"invocationId":"e28970da-13f2-46be-848e-c25de54539a1",
72+
"error": "Mandatory parameter 'account' was not provided."
73+
}
74+
```
75+
76+
An example of a call that failed with an unexpected error:
77+
```
78+
{
79+
"invocationId":"e28970da-13f2-46be-848e-c25de54539a1",
80+
"error": "An error occurred, see the Azure Function logs for more details"
81+
}
82+
```
83+
---
84+
85+
## GetItems
86+
`https://<YourAzureFunctionsApp>.azurewebsites.net/api/DataLake/GetItems`
87+
88+
This function is intended as an improved version of the ADF *Get Metadata* activity. It can be called from ADF using the *Execute Function* activity, using the following parameters.
89+
90+
### Parameters
91+
The [generic](#generic-parameters) and [authentication](#authentication-parameters) parameters above are mandatory. In addition the following optional parameters can be specified.
92+
93+
| Parameter | Type | Description |
94+
| ----------- | ----------- | ----------- |
95+
| ignoreDirectoryCase | Bool | This will call checkPathCase on the path parameter before getting the items. This means that if the path is incorrectly cased, but there is only one path that matches when looking case-insensitively, the function will return results. |
96+
| limit | String | The number of results to return. |
97+
| filter[PropertyName] | String | This allows filtering the results using the properties of items in the result set. Format is `operator:value` allowing flexibility building filters. Valid `PropertyName` options are any of the returned properties of a file or folder. The `like` operator matching supports full .Net style regular expressions. |
98+
| orderBy | String | The property to order the result set by. Valid properties are those of the returned json for each object, eg `LastModified`. |
99+
| orderByDesc | Bool | Sorts the results descending if true. Default when not specified is false. Used with ordering on `LastModified` and a limit of 1 will find the most recent file matching a filter. |
100+
| recursive | Bool | Look through folders recursively. |
101+
102+
### Filter Types
103+
When providing a filter parameter, there are a number of operators that can be used. The format is `filter[PropertyName]=operator:value`.
104+
| Operator | Description |
| - | - |
105+
| eq | Check if the property named `PropertyName` is equal to the value provided |
106+
| ne | Check if the property named `PropertyName` is not equal to the value provided |
107+
| lt | Check if the property named `PropertyName` is less than the value provided |
108+
| gt | Check if the property named `PropertyName` is greater than the value provided |
109+
| le | Check if the property named `PropertyName` is less than or equal to the value provided |
110+
| ge | Check if the property named `PropertyName` is greater than or equal to the value provided |
111+
| like | Check if the property named `PropertyName` matches the pattern provided. You can use `*` for wildcards, but .Net style regular expressions are also supported. |
112+
113+
#### Examples:
114+
Filter the results to return only files:
115+
```
116+
filter[IsDirectory]=eq:false
117+
```
118+
119+
Filter the results to return only folders:
120+
```
121+
filter[IsDirectory]=eq:true
122+
```
123+
124+
Filter the results to return files and folders modified since 2021-09-01 14:00:00:
125+
```
126+
filter[LastModified]=ge:2021-09-01 14:00:00
127+
```
128+
129+
Filter the results to return only parquet files using a wildcard:
130+
```
131+
filter[Name]=like:*.parquet
132+
```
133+
134+
Filter the results to return only files or folders starting with *'abc'* or *'xyz'* using a regular expression:
135+
```
136+
filter[Name]=like:(abc|xyz)*
137+
```
138+
139+
We can also combine multiple filters using `&`, for example to find files modified in September 2021. We could also add an `orderBy` to this to allow processing files in a date range in order:
140+
```
141+
filter[IsDirectory]=eq:false&filter[LastModified]=ge:2021-09-01 00:00:00&filter[LastModified]=lt:2021-10-01 00:00:00
142+
```
143+
144+
145+
> Note: When using filters, you must URL encode any special characters when sending the request. This is especially important for regular expression filters.
146+
147+
### Return Values
148+
If parameters are used incorrectly, the returned JSON will have the error details. All other errors return a simple, generic error message, but the Azure Functions app will have detailed logging available for the execution.
149+
150+
151+
Returned values are JSON objects. Below is an example of a successful call:
152+
```
153+
{
154+
"invocationId": "c21b69dc-9e76-42da-9953-ec63519f378a",
155+
"debugInfo": {
156+
"informationalVersion": "1.0.0"
157+
},
158+
"storageContainerUrl": "https://<YourAzureFunctionsApp>.dfs.core.windows.net/myContainer",
159+
"clientId": "f4b9d6e7-2753-44c6-a579-0bd77caa287d",
160+
"authType": "UserServicePrincipal",
161+
"parameters": {
162+
"Path": "/",
163+
"IgnoreDirectoryCase": false,
164+
"Recursive": true,
165+
"OrderByColumn": null,
166+
"OrderByDescending": false,
167+
"Limit": 0,
168+
"Filters": [
169+
{
170+
"PropertyName": "IsDirectory",
171+
"Operator": "eq",
172+
"Value": "true",
173+
"ErrorMessage": null
174+
}
175+
]
176+
},
177+
"fileCount": 3,
178+
"files": [
179+
{
180+
"Name": "TestData",
181+
"Directory": "",
182+
"FullPath": "TestData",
183+
"Url": "https://<YourAzureDataLake>.dfs.core.windows.net/myContainer/TestData",
184+
"IsDirectory": true,
185+
"ContentLength": 0,
186+
"LastModified": "2021-09-09T17:23:14Z"
187+
},
188+
{
189+
"Name": "TestFolder1",
190+
"Directory": "TestData",
191+
"FullPath": "TestData/TestFolder1",
192+
"Url": "https://<YourAzureDataLake>.dfs.core.windows.net/myContainer/TestData/TestFolder1",
193+
"IsDirectory": true,
194+
"ContentLength": 0,
195+
"LastModified": "2021-09-09T17:23:14Z"
196+
},
197+
{
198+
"Name": "TestFolder2",
199+
"Directory": "TestData",
200+
"FullPath": "TestData/TestFolder2",
201+
"Url": "https://<YourAzureDataLake>.dfs.core.windows.net/myContainer/TestData/TestFolder2",
202+
"IsDirectory": true,
203+
"ContentLength": 0,
204+
"LastModified": "2021-09-09T17:23:14Z"
205+
}
206+
]
207+
}
208+
```
209+
> Note: If no files are returned, then the `fileCount` will be 0, and the `files` property will be an empty array.
210+
211+
An example of a call with a mandatory parameter missing:
212+
```
213+
{
214+
"invocationId":"e28970da-13f2-46be-848e-c25de54539a1",
215+
"error": "Mandatory parameter 'account' was not provided."
216+
}
217+
```
218+
219+
An example of a call that failed with an unexpected error:
220+
```
221+
{
222+
"invocationId":"e28970da-13f2-46be-848e-c25de54539a1",
223+
"error": "An error occurred, see the Azure Function logs for more details"
224+
}
225+
```
226+
227+

0 commit comments

Comments
 (0)