
Commit 761f211

Merge branch 'feature/tca' into main
2 parents: 5ea550d + 4c75622

14 files changed: +131 −22 lines

pca-server/src/pca/pca-aws-file-drop-trigger.py

Lines changed: 8 additions & 1 deletion

@@ -1,3 +1,11 @@
+"""
+This python function is triggered when a new audio file is dropped into the S3 bucket that has
+been configured for audio ingestion. It will ensure that no Transcribe job already exists for this
+filename, and will then trigger the main Step Functions workflow to process this file.
+
+Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+SPDX-License-Identifier: Apache-2.0
+"""
 import json
 import urllib.parse
 import boto3
@@ -7,7 +15,6 @@
 def lambda_handler(event, context):
     # Load our configuration
     cf.loadConfiguration()
-    print("S3 Event: " + str(event["Records"][0]))

     # Get the object from the event and validate it exists
     s3 = boto3.client("s3")
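As background for the trigger above: S3 event notifications deliver URL-encoded object keys, and the docstring says the handler must confirm no Transcribe job already exists before starting the workflow. A minimal sketch of both steps, assuming a duck-typed Transcribe client; the helper names are illustrative, not the repository's actual code:

```python
import urllib.parse

def job_name_from_s3_event(event):
    # S3 event notifications URL-encode the object key ('+' for spaces),
    # so decode it before deriving a job name from the filename
    key = urllib.parse.unquote_plus(event["Records"][0]["s3"]["object"]["key"])
    return key.split("/")[-1]

def job_already_exists(job_name, transcribe_client):
    # Transcribe raises an exception for an unknown job name, which we
    # treat as "safe to start a new workflow execution"
    try:
        transcribe_client.get_transcription_job(TranscriptionJobName=job_name)
        return True
    except Exception:
        return False
```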

pca-server/src/pca/pca-aws-sf-bulk-files-count.py

Lines changed: 11 additions & 7 deletions

@@ -1,15 +1,18 @@
+"""
+This python function is part of the bulk files workflow. The system will load the Bulk configuration values
+once, and re-use them throughout the run, so the config values at the start of the run will remain valid.
+There is no quick way to count the files in an S3 bucket, so rather than track what's left in the bucket
+we just care about having any left to process, and instead count how far we've gotten.
+
+Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+SPDX-License-Identifier: Apache-2.0
+"""
 import pcaconfiguration as cf
 import copy
 import boto3

+
 def lambda_handler(event, context):
-    """
-    Entrypoint for bulk loading audio files. The system will load the Bulk configuration values
-    once, and re-use them throughout the run, so the config values at the start of the run will
-    remain valid. There is not quick way to count the files in an S3 bucket, so rather than track
-    what's left in the bucket we just care about having any left to process and instead count
-    how far we've gotten instead.
-    """

     # Get our params, looking them up if we haven't got them
     if "sourceBucket" in event:
@@ -46,6 +49,7 @@ def lambda_handler(event, context):
     # Return current event data
     return sfData

+
 if __name__ == "__main__":
     event = {}
     print(lambda_handler(event, ""))
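The "any left to process" shortcut described in the new docstring can be sketched as follows. This is a hypothetical helper with a duck-typed S3 client, not the file's actual code: listing with `MaxKeys=1` answers the yes/no question without paging through the whole bucket.

```python
def any_files_left(s3_client, bucket, prefix=""):
    # One object is enough to answer "is there anything left?", so cap
    # the listing at a single key instead of counting the whole bucket
    resp = s3_client.list_objects_v2(Bucket=bucket, Prefix=prefix, MaxKeys=1)
    return resp.get("KeyCount", 0) > 0
```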

pca-server/src/pca/pca-aws-sf-bulk-move-files.py

Lines changed: 11 additions & 4 deletions

@@ -1,11 +1,17 @@
+"""
+This python function is part of the bulk files workflow. Based upon the queueSpace parameter, this will
+move up to that many files into the PCA audio bucket, but only up to a maximum number as specified by
+the dripRate - this ensures that we don't overload the system.
+
+Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+SPDX-License-Identifier: Apache-2.0
+"""
 import copy
 import boto3

+
 def lambda_handler(event, context):
-    """
-    Based upon the queueSpace parameter, this will move up to that many file into the PCA audio bucket, but
-    only up to a maximum number as specified by the dripRate - this ensures that we don't overload they system
-    """
+
     # Load our event
     sfData = copy.deepcopy(event)
     filesLimit = sfData["filesLimit"]
@@ -42,6 +48,7 @@ def lambda_handler(event, context):
     sfData.pop("queueSpace", None)
     return sfData

+
 if __name__ == "__main__":
     event = {
         "sourceBucket": "pca-bulk-upload",
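The queueSpace/dripRate interaction the docstring describes reduces to a clamp. A one-function sketch under that reading (the function name is illustrative):

```python
def files_to_move(queue_space, drip_rate):
    # Fill the available queue space, but never exceed the per-cycle
    # drip rate, and never go negative
    return max(0, min(queue_space, drip_rate))
```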

pca-server/src/pca/pca-aws-sf-bulk-queue-space.py

Lines changed: 13 additions & 5 deletions

@@ -1,6 +1,16 @@
+"""
+This python function is part of the bulk files workflow. Checks the current state of the Transcribe job queue,
+taking into account running and queued jobs. It then returns the calculated head-space in the queue that the
+Bulk process is able to use. If any of the API calls to Transcribe or S3 get throttled then we say the queue
+is full this cycle and carry on.
+
+Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+SPDX-License-Identifier: Apache-2.0
+"""
 import copy
 import boto3

+
 def countTranscribeJobsInState(status, client, filesLimit):
     """
     Queries Transcribe for the number of jobs with the given status. If there are more than 100
@@ -15,12 +25,9 @@ def countTranscribeJobsInState(status, client, filesLimit):

     return found

+
 def lambda_handler(event, context):
-    """
-    Checks the current state of the Transcribe job queue, taking into account running and queued jobs.
-    It then returns the calculated head-space in the queue that the Bulk process is able to use. If any
-    of the API calls to Transcribe or S3 get throttled then we say the queue is full this cycle and carry on
-    """
+
     # Load our event, but we no longer need "filesToMove"
     sfData = copy.deepcopy(event)
     filesLimit = sfData["filesLimit"]
@@ -41,6 +48,7 @@ def lambda_handler(event, context):
     sfData["queueSpace"] = max(0, (filesLimit - found))
     return sfData

+
 if __name__ == "__main__":
     event = {
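The head-space calculation and the "throttled means full" rule from the docstring can be sketched with an injected counting function standing in for the real Transcribe calls; the status strings and helper name are assumptions for illustration:

```python
def queue_head_space(files_limit, count_jobs_in_state):
    # count_jobs_in_state(status) -> number of Transcribe jobs in that
    # state; any exception (e.g. throttling) means we report the queue
    # as full this cycle and carry on, as the docstring describes
    try:
        found = count_jobs_in_state("IN_PROGRESS") + count_jobs_in_state("QUEUED")
    except Exception:
        return 0
    return max(0, files_limit - found)
```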

pca-server/src/pca/pca-aws-sf-get-detected-language.py

Lines changed: 9 additions & 0 deletions

@@ -1,3 +1,11 @@
+"""
+This python function is part of the main processing workflow. It picks out the result of a transcription job
+and extracts the language code. This is only used on jobs that were started on a short audio clip with the
+sole purpose of language identification.
+
+Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+SPDX-License-Identifier: Apache-2.0
+"""
 from urllib.parse import urlparse
 import boto3
 import copy
@@ -39,6 +47,7 @@ def lambda_handler(event, context):
     sfData["langCode"] = transcribeJobInfo["LanguageCode"]
     return sfData

+
 # Main entrypoint for testing
 if __name__ == "__main__":
     event = {
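The file imports `urlparse`, presumably because a completed job references its output via an S3 URI. A small sketch of how such a URI splits into bucket and key (the helper name is illustrative, not the file's actual code):

```python
from urllib.parse import urlparse

def bucket_and_key(s3_uri):
    # "s3://bucket/some/key.json" -> ("bucket", "some/key.json")
    parsed = urlparse(s3_uri)
    return parsed.netloc, parsed.path.lstrip("/")
```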

pca-server/src/pca/pca-aws-sf-language-detection.py

Lines changed: 8 additions & 0 deletions

@@ -1,3 +1,11 @@
+"""
+This python function is part of the main processing workflow. It will create a 30-second clip of our original
+audio file and submit it to standard Amazon Transcribe, on the understanding that the next workflow step
+is interested in the detected language code that this job generates and not the transcript of the clip.
+
+Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+SPDX-License-Identifier: Apache-2.0
+"""
 import copy
 import boto3
 import pcaconfiguration as cf
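The 30-second clip step could be done with ffmpeg, which is plausible given that a sibling file in this commit mentions FFPROBE; the actual clipping mechanism is an assumption, and this command-builder is purely illustrative:

```python
def build_clip_command(input_path, output_path, clip_seconds=30):
    # Copy only the first clip_seconds of audio; the transcript of this
    # clip is discarded, only the detected language code is used
    return ["ffmpeg", "-y", "-i", input_path,
            "-t", str(clip_seconds), "-acodec", "copy", output_path]
```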

pca-server/src/pca/pca-aws-sf-process-turn-by-turn.py

Lines changed: 8 additions & 5 deletions

@@ -1,6 +1,9 @@
 """
-Parses the output from an Amazon Transcribe job into turn-by-turn
-speech segments with sentiment analysis scores from Amazon Comprehend
+This python function is part of the main processing workflow. Parses the output from an Amazon Transcribe job into
+turn-by-turn speech segments with sentiment analysis scores from Amazon Comprehend.
+
+Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+SPDX-License-Identifier: Apache-2.0
 """
 from pathlib import Path
 from datetime import datetime
@@ -408,7 +411,7 @@ def create_combined_tca_graphic(self):

         # Upload the graphic to S3
         s3Client = boto3.client('s3')
-        object_key = cf.appConfig[cf.CONF_PREFIX_PARSED_RESULTS] + "/tcaImagery/" + base_filename
+        object_key = "tcaImagery/" + base_filename
         s3Client.upload_file(chart_filename, cf.appConfig[cf.CONF_S3BUCKET_OUTPUT], object_key)

         # Remove the local file and return our S3 URL so that the UI can create signed URLs for browser rendering
@@ -1544,9 +1547,9 @@ def lambda_handler(event, context):
         # "key": "originalAudio/stereo.mp3",
         # "apiMode": "analytics",
         # "jobName": "stereo.mp3",
-        "key": "originalAudio/example-call.wav",
+        "key": "originalAudio/Auto1_GUID_001_AGENT_AndrewK_DT_2021-12-01T07-55-51.wav",
         "apiMode": "analytics",
-        "jobName": "example-call.wav",
+        "jobName": "Auto1_GUID_001_AGENT_AndrewK_DT_2021-12-01T07-55-51.wav",
         "langCode": "en-US",
         "transcribeStatus": "COMPLETED"
     }
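The core of "turn-by-turn" parsing is merging consecutive items from the same speaker into single segments. A minimal sketch of that grouping idea, not this file's actual implementation (the item shape is an assumption):

```python
def group_into_turns(items):
    # items: time-ordered (speaker, start_time, text) tuples; merge
    # consecutive entries from the same speaker into one turn
    turns = []
    for speaker, start, text in items:
        if turns and turns[-1]["speaker"] == speaker:
            turns[-1]["text"] += " " + text
        else:
            turns.append({"speaker": speaker, "start": start, "text": text})
    return turns
```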

pca-server/src/pca/pca-aws-sf-start-transcribe-job.py

Lines changed: 10 additions & 0 deletions

@@ -1,3 +1,12 @@
+"""
+This python function is part of the main processing workflow. It will start a job in the Amazon Transcribe service,
+using whatever configuration parameters are set. It handles all of the cross-validation of parameters, and takes
+into account the audio format - it will then degrade certain feature requests; e.g. if you have configured the app
+to do channel-separated audio jobs but the audio file is mono then it will switch to speaker-separation mode.
+
+Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+SPDX-License-Identifier: Apache-2.0
+"""
 import copy
 import boto3
 import subprocess
@@ -50,6 +59,7 @@ def delete_existing_job(job_name, transcribe, api_mode):
     # If the job has already been deleted then we don't need to take any action
     pass

+
 def count_audio_channels(bucket, key):
     '''
     Examines an audio file using the FFPROBE utility to determine the number of audio channels in the file. If
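The degrade behaviour the docstring describes (channel separation requires stereo; mono falls back to speaker separation) can be sketched as a simple decision function; the mode strings here are illustrative, not the repository's actual configuration values:

```python
def choose_separation_mode(configured_mode, channel_count):
    # Channel separation needs at least two audio channels; degrade to
    # speaker separation when the file turns out to be mono
    if configured_mode == "channel" and channel_count < 2:
        return "speaker"
    return configured_mode
```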

pca-server/src/pca/pca-aws-sf-transcribe-failed.py

Lines changed: 10 additions & 0 deletions

@@ -1,7 +1,16 @@
+"""
+This python function is part of the main processing workflow. It handles the clean-up for when the workflow fails
+for expected reasons, such as being unable to perform Language Identification, and clears up or moves any resources
+associated with this execution.
+
+Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+SPDX-License-Identifier: Apache-2.0
+"""
 import boto3
 import pcaconfiguration as cf
 import pcacommon

+
 def lambda_handler(event, context):
     """
     When a file has failed to transcribe then we need to do two things:
@@ -35,6 +44,7 @@ def lambda_handler(event, context):
     # Return our input data as the final result
     return event

+
 if __name__ == "__main__":
     event = {
         "bucket": "ajk-call-analytics-demo",

pca-server/src/pca/pca-aws-sf-wait-for-transcribe-notification.py

Lines changed: 8 additions & 0 deletions

@@ -1,3 +1,11 @@
+"""
+This python function is part of the main processing workflow. It is called when a Transcribe job is started, and it
+will create an entry in a DynamoDB table that holds some job information and the Step Functions task token. The Step
+Function should then wait for another task to read this task token from DynamoDB and resume the execution.
+
+Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+SPDX-License-Identifier: Apache-2.0
+"""
 import json
 import boto3
 import os
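The task-token pattern the docstring describes stores the Step Functions token keyed by job name, so a later Lambda can look it up and call `send_task_success` to resume the execution. A sketch of the DynamoDB item shape; the attribute names are assumptions, as the real table schema is defined elsewhere in the stack:

```python
def build_token_item(job_name, task_token):
    # Key the item on the Transcribe job name and store the Step
    # Functions task token for a later callback; low-level DynamoDB
    # item format ({"S": ...} for string attributes)
    return {
        "PKJobId": {"S": job_name},
        "taskToken": {"S": task_token},
    }
```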

0 commit comments
