|
| 1 | +import oci |
| 2 | +import yaml |
| 3 | +import argparse |
| 4 | +import os |
| 5 | +import time |
| 6 | +from pathlib import Path |
| 7 | + |
| 8 | +# --- Helper Functions --- |
| 9 | + |
def load_config(path='config.yaml'):
    """Load and parse the YAML configuration file.

    Args:
        path: Path to the YAML configuration file. Defaults to
            'config.yaml' in the current working directory, preserving
            the previous hard-coded behavior.

    Returns:
        The parsed configuration (typically a dict).

    Raises:
        FileNotFoundError: If the file does not exist.
        yaml.YAMLError: If the file is not valid YAML.
    """
    with open(path, 'r') as f:
        return yaml.safe_load(f)
| 14 | + |
def upload_to_object_storage(object_storage_client, namespace, bucket_name, file_path):
    """Upload a local file to an OCI Object Storage bucket.

    The object is stored under its basename and that name is returned so
    the caller can reference it in subsequent service requests.
    """
    object_name = os.path.basename(file_path)
    print(f"INFO: Uploading '{object_name}' to bucket '{bucket_name}'...")
    with open(file_path, 'rb') as source:
        object_storage_client.put_object(namespace, bucket_name, object_name, source)
    print("INFO: Upload complete.")
    return object_name
| 23 | + |
def wait_for_job_completion(client, job_id, compartment_id, check_interval=30):
    """Poll an OCI Language job until it reaches a terminal state.

    Args:
        client: AIServiceLanguageClient used to fetch job status.
        job_id: OCID of the job to poll.
        compartment_id: Unused; retained for interface compatibility.
        check_interval: Seconds to sleep between status checks.

    Returns:
        True if the job succeeded; False if it failed, was canceled, or
        the status lookup raised a ServiceError.
    """
    job_model = oci.ai_language.models.Job
    terminal_failures = (
        job_model.LIFECYCLE_STATE_FAILED,
        job_model.LIFECYCLE_STATE_CANCELED,
    )
    while True:
        try:
            status = client.get_job(job_id=job_id).data.lifecycle_state
        except oci.exceptions.ServiceError as e:
            print(f"ERROR: Error checking job status: {e}")
            return False

        if status == job_model.LIFECYCLE_STATE_SUCCEEDED:
            print("INFO: Job succeeded.")
            return True
        if status in terminal_failures:
            print(f"ERROR: Job failed with status: {status}")
            return False

        print(f"INFO: Job status: {status}. Waiting {check_interval} seconds...")
        time.sleep(check_interval)
| 46 | + |
# --- Model Discovery (with caching) ---
# Maps model display name -> model OCID so repeated language pairs do not
# re-query the service.
model_cache = {}

def get_translation_model_id(language_client, tenancy_id, source_lang, target_lang):
    """Find the OCID of the pre-trained translation model for a language pair.

    Pre-trained models live in the tenancy's root compartment and are named
    "Pre-trained Translation model <src>-<tgt>" using 2-letter codes.

    Args:
        language_client: AIServiceLanguageClient used to list models.
        tenancy_id: OCID of the tenancy (root compartment).
        source_lang: Source language code, e.g. 'en' or 'pt-BR'.
        target_lang: Target language code, e.g. 'es' or 'zh-CN'.

    Returns:
        The model OCID, or None if the model was not found or listing failed.
    """
    # OCI uses 2-letter codes for this model name format, e.g. 'en-es';
    # strip any region suffix such as 'pt-BR' -> 'pt'.
    source = source_lang.split('-')[0]
    target = target_lang.split('-')[0]
    model_name = f"Pre-trained Translation model {source}-{target}"

    if model_name in model_cache:
        return model_cache[model_name]

    print(f"INFO: Searching for model '{model_name}'...")
    try:
        # Fetch ALL pages: a single list_models call returns only the first
        # page, which could silently miss the model in tenancies with many
        # models. list_call_get_all_results aggregates every page into .data.
        list_models_response = oci.pagination.list_call_get_all_results(
            language_client.list_models, compartment_id=tenancy_id
        )

        for model in list_models_response.data:
            if model.display_name == model_name:
                print(f"INFO: Found model ID: {model.id}")
                model_cache[model_name] = model.id
                return model.id

        print(f"ERROR: Pre-trained translation model not found for {source_lang} -> {target_lang}")
        return None
    except oci.exceptions.ServiceError as e:
        print(f"ERROR: Could not list models. Check permissions for the root compartment. {e}")
        return None
| 76 | + |
| 77 | +# --- Main Translation Logic --- |
| 78 | + |
def translate_srt_async(language_client, object_storage_client, config_yaml, model_id, input_file, target_lang=None):
    """Create an asynchronous OCI Language job to translate a file and wait for it.

    Uploads ``input_file`` to Object Storage, submits a translation job that
    references the given pre-trained model, and blocks until the job reaches
    a terminal state.

    Args:
        language_client: AIServiceLanguageClient used to create/poll the job.
        object_storage_client: ObjectStorageClient used for the upload.
        config_yaml: Parsed config dict; reads speech.namespace,
            speech.bucket_name, and language.compartment_id.
        model_id: OCID of the pre-trained translation model to use.
        input_file: Local path of the file to translate.
        target_lang: Optional target language code used only in the job's
            display name. If None, falls back to the last '-' segment of
            model_id (previous behavior; an OCID is not a language pair, so
            that fallback yields an opaque suffix — cosmetic only).

    Returns:
        True if the translation job succeeded, False otherwise.
    """
    namespace = config_yaml['speech']['namespace']
    bucket_name = config_yaml['speech']['bucket_name']
    compartment_id = config_yaml['language']['compartment_id']
    if target_lang is None:
        # Backward-compatible fallback for callers that don't pass the
        # language code explicitly.
        target_lang = model_id.split('-')[-1]

    try:
        # 1. Upload the source file to Object Storage
        object_name = upload_to_object_storage(object_storage_client, namespace, bucket_name, input_file)

        # 2. Define input and output locations in Object Storage
        input_location = oci.ai_language.models.ObjectStorageFileNameLocation(
            namespace_name=namespace,
            bucket_name=bucket_name,
            object_names=[object_name]
        )

        output_location = oci.ai_language.models.ObjectPrefixOutputLocation(
            namespace_name=namespace,
            bucket_name=bucket_name,
            prefix=f"translated_output/{Path(input_file).stem}/"
        )

        # 3. Define the job details, referencing the pre-trained model ID
        create_job_details = oci.ai_language.models.CreateJobDetails(
            display_name=f"Translate_{object_name}_to_{target_lang}",
            compartment_id=compartment_id,
            input_location=input_location,
            output_location=output_location,
            model_metadata_details=[
                oci.ai_language.models.ModelMetadataDetails(model_id=model_id)
            ]
        )

        # 4. Create the job
        create_job_response = language_client.create_job(create_job_details=create_job_details)
        job_id = create_job_response.data.id
        print(f"INFO: Job created with ID: {job_id}")

        # 5. Wait for the job to complete
        return wait_for_job_completion(language_client, job_id, compartment_id)

    except oci.exceptions.ServiceError as e:
        print(f"ERROR: Failed to create translation job: {e}")
        return False
| 127 | + |
| 128 | + |
def main():
    """CLI entry point: translate an SRT file into one or more target languages."""
    SUPPORTED_LANGUAGES = {
        'ar': 'Arabic', 'hr': 'Croatian', 'cs': 'Czech', 'da': 'Danish',
        'nl': 'Dutch', 'en': 'English', 'fi': 'Finnish', 'fr': 'French',
        'fr-CA': 'French Canadian', 'de': 'German', 'el': 'Greek',
        'he': 'Hebrew', 'hu': 'Hungarian', 'it': 'Italian', 'ja': 'Japanese',
        'ko': 'Korean', 'no': 'Norwegian', 'pl': 'Polish', 'pt': 'Portuguese',
        'pt-BR': 'Portuguese Brazilian', 'ro': 'Romanian', 'ru': 'Russian',
        'zh-CN': 'Simplified Chinese', 'sk': 'Slovak', 'sl': 'Slovenian',
        'es': 'Spanish', 'sv': 'Swedish', 'th': 'Thai', 'zh-TW': 'Traditional Chinese',
        'tr': 'Turkish', 'vi': 'Vietnamese'
    }
    parser = argparse.ArgumentParser(description='Translate SRT files using OCI Language (Async Object Storage Method)')
    parser.add_argument('--input-file', required=True, help='Input SRT file path')
    parser.add_argument('--source-lang', default='en', help='Source language code (e.g., en)')
    parser.add_argument('--target-langs', nargs='+', help='Target language codes (e.g., es fr de)')
    args = parser.parse_args()

    if not os.path.exists(args.input_file):
        print(f"Error: Input file {args.input_file} not found")
        return

    config_yaml = load_config()
    profile_name = config_yaml.get("profile", "DEFAULT")
    try:
        oci_config = oci.config.from_file(profile_name=profile_name)
        tenancy_id = oci_config.get("tenancy")
        print(f"INFO: Loaded OCI profile '{profile_name}' for tenancy '{tenancy_id}'")
    except Exception as e:
        print(f"ERROR: Failed to load OCI configuration: {e}")
        return

    language_client = oci.ai_language.AIServiceLanguageClient(oci_config)
    object_storage_client = oci.object_storage.ObjectStorageClient(oci_config)

    # Default to every supported language when none are requested.
    target_langs = args.target_langs if args.target_langs else SUPPORTED_LANGUAGES.keys()

    for lang_code in target_langs:
        if lang_code == args.source_lang:
            continue
        if lang_code not in SUPPORTED_LANGUAGES:
            # Catch typos early instead of querying for a nonexistent model.
            print(f"WARNING: Skipping unsupported target language '{lang_code}'")
            continue
        print("-" * 50)
        print(f"Starting translation process for {args.source_lang} -> {lang_code}")

        # 1. Find the correct pre-trained model for this language pair
        model_id = get_translation_model_id(language_client, tenancy_id, args.source_lang, lang_code)

        if model_id:
            # 2. If model is found, start the asynchronous translation job
            translate_srt_async(
                language_client,
                object_storage_client,
                config_yaml,
                model_id,
                args.input_file
            )
        print("-" * 50)

if __name__ == "__main__":
    main()
0 commit comments