Skip to content

Commit a2ac9da

Browse files
authored
Merge pull request #2 from learntocloud/datapipeline
finished file processing
2 parents efff872 + b4c1cac commit a2ac9da

File tree

9 files changed

+368
-7
lines changed

9 files changed

+368
-7
lines changed
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
using System;
2+
using System.Collections.Generic;
3+
using System.Linq;
4+
using System.Text;
5+
using System.Text.Json.Serialization;
6+
using System.Threading.Tasks;
7+
8+
namespace azure_project_generator
9+
{
10+
/// <summary>
/// Document shape written to the Cosmos DB output binding: one certification/skill/topic/service
/// mapping together with the sentence describing it and that sentence's embedding vector.
/// Property names are mapped to camelCase JSON names through <see cref="JsonPropertyNameAttribute"/>.
/// </summary>
public class CertServiceDocument
{
    /// <summary>Unique identifier for the document.</summary>
    [JsonPropertyName("id")]
    public string id { get; set; }

    /// <summary>Composite key for the certification/service pair.</summary>
    [JsonPropertyName("certificationServiceKey")]
    public string CertificationServiceKey { get; set; }

    /// <summary>The certification code.</summary>
    [JsonPropertyName("certificationCode")]
    public string CertificationCode { get; set; }

    /// <summary>The certification name.</summary>
    [JsonPropertyName("certificationName")]
    public string CertificationName { get; set; }

    /// <summary>The skill associated with this certification.</summary>
    [JsonPropertyName("skillName")]
    public string SkillName { get; set; }

    /// <summary>The topic within the skill.</summary>
    [JsonPropertyName("topicName")]
    public string TopicName { get; set; }

    /// <summary>The service relevant to this certification and skill.</summary>
    [JsonPropertyName("serviceName")]
    public string ServiceName { get; set; }

    /// <summary>The combined sentence built from the fields above.</summary>
    [JsonPropertyName("contextSentence")]
    public string ContextSentence { get; set; }

    /// <summary>Vector embedding generated from <see cref="ContextSentence"/>.</summary>
    [JsonPropertyName("contextVector")]
    public float[] ContextVector { get; set; }
}
33+
}
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
using Newtonsoft.Json;
2+
using System;
3+
using System.Collections.Generic;
4+
using System.Linq;
5+
using System.Text;
6+
using System.Threading.Tasks;
7+
8+
namespace azure_project_generator
9+
{
10+
/// <summary>
/// Input record describing how one service maps onto a certification, skill and topic.
/// JSON names are bound with Newtonsoft's <c>JsonProperty</c> attribute.
/// </summary>
internal class MappedService
{
    /// <summary>Certification code, read from <c>certificationCode</c>.</summary>
    [JsonProperty("certificationCode")]
    public string CertificationCode { get; set; }

    /// <summary>Certification name, read from <c>certificationName</c>.</summary>
    [JsonProperty("certificationName")]
    public string CertificationName { get; set; }

    /// <summary>Skill name, read from <c>skillName</c>.</summary>
    [JsonProperty("skillName")]
    public string SkillName { get; set; }

    /// <summary>Topic name, read from <c>topicName</c>.</summary>
    [JsonProperty("topicName")]
    public string TopicName { get; set; }

    /// <summary>Service name, read from <c>serviceName</c>.</summary>
    [JsonProperty("serviceName")]
    public string ServiceName { get; set; }
}
27+
}
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
using Microsoft.Azure.Functions.Worker;
2+
using System;
3+
using System.Collections.Generic;
4+
using System.Linq;
5+
using System.Text;
6+
using System.Threading.Tasks;
7+
8+
namespace azure_project_generator
9+
{
10+
/// <summary>
/// Return type that lets a single function invocation feed two output bindings:
/// a Cosmos DB document and an archived copy of the blob content.
/// </summary>
public class MultipleOutput
{
    /// <summary>
    /// Document handed to the Cosmos DB output binding. The database and container names are
    /// taken from the <c>CosmosDb</c> and <c>CosmosContainerOut</c> settings, and the connection
    /// from <c>CosmosDBConnection</c>.
    /// </summary>
    [CosmosDBOutput("%CosmosDb%", "%CosmosContainerOut%", Connection = "CosmosDBConnection")]
    public CertServiceDocument CertServiceDocument { get; set; }

    /// <summary>
    /// Text handed to the blob output binding, targeting <c>certdataarchive/{name}</c> on the
    /// <c>AzureWebJobsStorage</c> connection.
    /// </summary>
    [BlobOutput("certdataarchive/{name}", Connection = "AzureWebJobsStorage")]
    public string ArchivedContent { get; set; }
}
20+
}
Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,156 @@
1+
using Azure;
2+
using Azure.AI.OpenAI;
3+
using Microsoft.Azure.Functions.Worker;
4+
using Microsoft.Extensions.Logging;
5+
using Newtonsoft.Json;
6+
using Newtonsoft.Json.Linq;
7+
using Newtonsoft.Json.Schema;
8+
using Newtonsoft.Json.Schema.Generation;
9+
using OpenAI.Embeddings;
10+
11+
namespace azure_project_generator
12+
{
13+
/// <summary>
/// Blob-triggered function that validates a JSON mapping file, builds a descriptive sentence
/// from it, generates an embedding for that sentence, and returns both a
/// <see cref="CertServiceDocument"/> and the original content through <see cref="MultipleOutput"/>.
/// </summary>
public class ProcessFile
{
    // The schema depends only on the MappedService type, so it is generated once on first use
    // and shared by every invocation instead of being rebuilt for each blob.
    private static readonly Lazy<JSchema> s_mappedServiceSchema =
        new Lazy<JSchema>(() => new JSchemaGenerator().Generate(typeof(MappedService)));

    private readonly ILogger<ProcessFile> _logger;
    private readonly EmbeddingClient _embeddingClient;

    /// <summary>Creates the function instance with its injected dependencies.</summary>
    /// <param name="logger">Logger used for all diagnostics.</param>
    /// <param name="embeddingClient">Client used to generate the embedding vector.</param>
    /// <exception cref="ArgumentNullException">Either argument is <c>null</c>.</exception>
    public ProcessFile(ILogger<ProcessFile> logger,
        EmbeddingClient embeddingClient)
    {
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
        _embeddingClient = embeddingClient ?? throw new ArgumentNullException(nameof(embeddingClient));
    }

    /// <summary>
    /// Processes one blob from the <c>certdata</c> container.
    /// </summary>
    /// <param name="content">Blob content bound as text.</param>
    /// <param name="name">Blob name captured from the trigger path.</param>
    /// <returns>
    /// A populated <see cref="MultipleOutput"/> on success. When the content is blank, fails
    /// schema validation, or deserializes to <c>null</c>, both outputs are <c>null</c>.
    /// </returns>
    /// <remarks>Exceptions raised while generating the embedding are logged and rethrown.</remarks>
    [Function(nameof(ProcessFile))]
    public async Task<MultipleOutput> Run(
        [BlobTrigger("certdata/{name}", Connection = "AzureWebJobsStorage")] string content,
        string name)
    {
        _logger.LogInformation("Processing blob: {BlobName}", name);

        if (string.IsNullOrWhiteSpace(content))
        {
            _logger.LogError("Blob content is empty or whitespace.");
            return EmptyOutput();
        }

        // ValidateJsonContent logs the reason itself, so nothing more is logged here.
        if (!ValidateJsonContent(content))
        {
            return EmptyOutput();
        }

        var mappedServiceData = JsonConvert.DeserializeObject<MappedService>(content);
        if (mappedServiceData == null)
        {
            _logger.LogError("Failed to deserialize content to MappedService.");
            return EmptyOutput();
        }

        string contextSentence = GenerateContextSentence(mappedServiceData);
        float[] contentVector = await GenerateEmbeddingsAsync(contextSentence);

        var certServiceDocument = CreateCertServiceDocument(mappedServiceData, contextSentence, contentVector);

        _logger.LogInformation("Document created successfully.");
        _logger.LogInformation("Archiving blob: {BlobName}", name);

        return new MultipleOutput
        {
            CertServiceDocument = certServiceDocument,
            ArchivedContent = content
        };
    }

    /// <summary>Builds the result used for rejected input: both outputs set to <c>null</c>.</summary>
    private static MultipleOutput EmptyOutput() =>
        new MultipleOutput { CertServiceDocument = null, ArchivedContent = null };

    /// <summary>
    /// Checks <paramref name="content"/> against the JSON schema generated from <see cref="MappedService"/>.
    /// </summary>
    /// <param name="content">Raw JSON text to validate.</param>
    /// <returns>
    /// <c>true</c> when the content parses and satisfies the schema; <c>false</c> when it does not
    /// or when any exception occurs. Every failure is logged.
    /// </returns>
    private bool ValidateJsonContent(string content)
    {
        try
        {
            // Accessed inside the try so a schema-generation failure is logged like any other error.
            JSchema schema = s_mappedServiceSchema.Value;

            JToken jsonContent = JToken.Parse(content);
            bool isValid = jsonContent.IsValid(schema, out IList<string> messages);

            if (!isValid)
            {
                foreach (var message in messages)
                {
                    _logger.LogError("Schema validation error: {ValidationMessage}", message);
                }
            }
            else
            {
                _logger.LogInformation("JSON content is valid against the schema.");
            }

            return isValid;
        }
        catch (JsonException ex)
        {
            _logger.LogError(ex, "JSON parsing error during validation");
            return false;
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Unexpected error during JSON validation");
            return false;
        }
    }

    /// <summary>Composes the sentence that is embedded and stored with the document.</summary>
    /// <param name="data">Mapping whose fields are substituted into the sentence.</param>
    private static string GenerateContextSentence(MappedService data) =>
        $"The {data.CertificationCode} {data.CertificationName} certification includes the skill of {data.SkillName}. Within this skill, there is a focus on the topic of {data.TopicName}, particularly through the use of the service {data.ServiceName}.";

    /// <summary>Requests an embedding for <paramref name="content"/> from the embedding client.</summary>
    /// <param name="content">Text to embed.</param>
    /// <returns>The embedding vector copied into an array.</returns>
    /// <remarks>Any exception is logged and rethrown unchanged.</remarks>
    private async Task<float[]> GenerateEmbeddingsAsync(string content)
    {
        try
        {
            _logger.LogInformation("Generating embedding...");
            var embeddingResult = await _embeddingClient.GenerateEmbeddingAsync(content).ConfigureAwait(false);
            _logger.LogInformation("Embedding created successfully.");
            return embeddingResult.Value.Vector.ToArray();
        }
        catch (RequestFailedException ex)
        {
            _logger.LogError(ex, "Azure OpenAI API request failed");
            throw;
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Error generating embedding");
            throw;
        }
    }

    /// <summary>
    /// Assembles the output document: a new GUID as <c>id</c>, a
    /// <c>{CertificationCode}-{ServiceName}</c> composite key, the mapping fields copied across,
    /// and the supplied sentence and vector.
    /// </summary>
    /// <param name="data">Source mapping.</param>
    /// <param name="contextSentence">Sentence generated from <paramref name="data"/>.</param>
    /// <param name="contentVector">Embedding of <paramref name="contextSentence"/>.</param>
    private static CertServiceDocument CreateCertServiceDocument(MappedService data, string contextSentence, float[] contentVector) =>
        new CertServiceDocument
        {
            id = Guid.NewGuid().ToString(),
            CertificationServiceKey = $"{data.CertificationCode}-{data.ServiceName}",
            CertificationCode = data.CertificationCode,
            CertificationName = data.CertificationName,
            SkillName = data.SkillName,
            TopicName = data.TopicName,
            ServiceName = data.ServiceName,
            ContextSentence = contextSentence,
            ContextVector = contentVector
        };
}
156+
}

azure-project-generator/Program.cs

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,37 @@
1+
using Azure;
2+
using Azure.AI.OpenAI;
13
using Microsoft.Azure.Functions.Worker;
4+
using Microsoft.Extensions.Configuration;
25
using Microsoft.Extensions.DependencyInjection;
36
using Microsoft.Extensions.Hosting;
47

58
var host = new HostBuilder()
69
.ConfigureFunctionsWorkerDefaults()
7-
.ConfigureServices(services =>
10+
.ConfigureServices((context, services) =>
811
{
912
services.AddApplicationInsightsTelemetryWorkerService();
1013
services.ConfigureFunctionsApplicationInsights();
14+
15+
// Get configuration
16+
var config = context.Configuration;
17+
18+
// Initialize Azure OpenAI client
19+
string keyFromEnvironment = config["AZURE_OPENAI_API_KEY"];
20+
string endpointFromEnvironment = config["AZURE_OPENAI_API_ENDPOINT"];
21+
string embeddingsDeployment = config["EMBEDDINGS_DEPLOYMENT"];
22+
23+
if (string.IsNullOrEmpty(keyFromEnvironment) || string.IsNullOrEmpty(endpointFromEnvironment) || string.IsNullOrEmpty(embeddingsDeployment))
24+
{
25+
throw new InvalidOperationException("Required Azure OpenAI configuration is missing.");
26+
}
27+
28+
AzureOpenAIClient azureClient = new(
29+
new Uri(endpointFromEnvironment),
30+
new AzureKeyCredential(keyFromEnvironment));
31+
32+
// Register EmbeddingClient as a singleton
33+
services.AddSingleton(azureClient.GetEmbeddingClient(embeddingsDeployment));
1134
})
1235
.Build();
1336

14-
host.Run();
37+
host.Run();
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
{
2+
"$schema": "https://schema.management.azure.com/schemas/2018-05-01/subscriptionDeploymentTemplate.json#",
3+
"contentVersion": "1.0.0.0",
4+
"parameters": {
5+
"resourceGroupName": {
6+
"type": "string",
7+
"defaultValue": "azureprojectgenerator-rg",
8+
"metadata": {
9+
"_parameterType": "resourceGroup",
10+
"description": "Name of the resource group for the resource. It is recommended to put resources under same resource group for better tracking."
11+
}
12+
},
13+
"resourceGroupLocation": {
14+
"type": "string",
15+
"defaultValue": "eastus",
16+
"metadata": {
17+
"_parameterType": "location",
18+
"description": "Location of the resource group. Resource groups could have different location than resources."
19+
}
20+
},
21+
"resourceLocation": {
22+
"type": "string",
23+
"defaultValue": "[parameters('resourceGroupLocation')]",
24+
"metadata": {
25+
"_parameterType": "location",
26+
"description": "Location of the resource. By default use resource group's location, unless the resource provider is not supported there."
27+
}
28+
}
29+
},
30+
"resources": [
31+
{
32+
"type": "Microsoft.Resources/resourceGroups",
33+
"name": "[parameters('resourceGroupName')]",
34+
"location": "[parameters('resourceGroupLocation')]",
35+
"apiVersion": "2019-10-01"
36+
},
37+
{
38+
"type": "Microsoft.Resources/deployments",
39+
"name": "[concat(parameters('resourceGroupName'), 'Deployment', uniqueString(concat('azureprojectgenstor', subscription().subscriptionId)))]",
40+
"resourceGroup": "[parameters('resourceGroupName')]",
41+
"apiVersion": "2019-10-01",
42+
"dependsOn": [
43+
"[parameters('resourceGroupName')]"
44+
],
45+
"properties": {
46+
"mode": "Incremental",
47+
"template": {
48+
"$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#",
49+
"contentVersion": "1.0.0.0",
50+
"resources": [
51+
{
52+
"sku": {
53+
"name": "Standard_RAGRS",
54+
"tier": "Standard"
55+
},
56+
"kind": "StorageV2",
57+
"name": "azureprojectgenstor",
58+
"type": "Microsoft.Storage/storageAccounts",
59+
"location": "[parameters('resourceLocation')]",
60+
"apiVersion": "2017-10-01"
61+
}
62+
]
63+
}
64+
}
65+
}
66+
],
67+
"metadata": {
68+
"_dependencyType": "storage.azure"
69+
}
70+
}

azure-project-generator/Properties/serviceDependencies.json

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,14 @@
66
"storage1": {
77
"type": "storage",
88
"connectionId": "AzureWebJobsStorage"
9+
},
10+
"secrets1": {
11+
"type": "secrets"
12+
},
13+
"storage2": {
14+
"type": "storage",
15+
"connectionId": "azurestorage",
16+
"dynamicId": null
917
}
1018
}
1119
}

azure-project-generator/Properties/serviceDependencies.local.json

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,17 @@
66
"storage1": {
77
"type": "storage.emulator",
88
"connectionId": "AzureWebJobsStorage"
9+
},
10+
"secrets1": {
11+
"type": "secrets.user"
12+
},
13+
"storage2": {
14+
"serviceConnectorResourceId": "/subscriptions/[parameters('subscriptionId')]/resourceGroups/[parameters('resourceGroupName')]/providers/Microsoft.ServiceLinker/locations/eastus/connectors/azurestorage_B223BFB07A",
15+
"secretStore": "LocalSecretsFile",
16+
"resourceId": "/subscriptions/[parameters('subscriptionId')]/resourceGroups/[parameters('resourceGroupName')]/providers/Microsoft.Storage/storageAccounts/azureprojectgenstor",
17+
"type": "storage.azure",
18+
"connectionId": "azurestorage",
19+
"dynamicId": null
920
}
1021
}
1122
}

0 commit comments

Comments
 (0)