Skip to content

Commit 19e1afe

Browse files
committed
first basic translation
Signed-off-by: Keenan Kalra <[email protected]>
1 parent f8b8250 commit 19e1afe

File tree

6 files changed

+383
-17
lines changed

6 files changed

+383
-17
lines changed

oci-subtitle-translation/README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@ The solution combines two powerful OCI services:
1010

1111
This automated approach significantly reduces the time and effort required to create multilingual subtitles, making content more accessible to a global audience.
1212

13+
PUT IMAGE HERE
14+
join oracle profile
1315
## 0. Prerequisites and setup
1416

1517
### Prerequisites
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
# download_srt.py
2+
"""
3+
Download a single SRT file from OCI Object Storage.
4+
5+
Usage (conda env with python-oci-sdk installed):
6+
python download_srt.py --profile aisolutions --outfile test.mp3.srt
7+
"""
8+
import argparse
9+
import oci
10+
11+
# Object Storage coordinates of the single SRT file this script downloads.
# All three values are hard-coded for one specific transcription job.
NAMESPACE = "axytmnxp84kg"
BUCKET = "SubtitleTranslatorSystem"

# Full object path inside the bucket, assembled from its path segments.
OBJECT_NAME = "".join(
    [
        "transcriptions/Test.mp3/",
        "job-amaaaaaaywfcc6aakabq6orrvcofpfoohku2tixcwjoxxlqipiru3u6qptra/",
        "axytmnxp84kg_SubtitleTranslatorSystem_Test.mp3.srt",
    ]
)
18+
19+
def main(profile: str, outfile: str, namespace=None, bucket=None, object_name=None) -> None:
    """Download one object from OCI Object Storage into a local file.

    Args:
        profile: Name of the profile to load from the OCI config file.
        outfile: Local path the object's bytes are written to.
        namespace: Object Storage namespace; defaults to ``NAMESPACE``.
        bucket: Bucket name; defaults to ``BUCKET``.
        object_name: Object path inside the bucket; defaults to
            ``OBJECT_NAME``.

    Any exception raised by the OCI SDK (config loading, the request itself)
    or by local file I/O propagates to the caller.
    """
    # Resolve the defaults at call time so the module constants stay the
    # single source of truth.
    namespace = NAMESPACE if namespace is None else namespace
    bucket = BUCKET if bucket is None else bucket
    object_name = OBJECT_NAME if object_name is None else object_name

    # Load config for the chosen profile
    config = oci.config.from_file(profile_name=profile)
    obj_client = oci.object_storage.ObjectStorageClient(config)

    # Issue the request BEFORE opening the output file: "wb" truncates on
    # open, so a failed request (missing object, bad credentials) would
    # otherwise leave an empty file behind or wipe an existing one.
    get_resp = obj_client.get_object(
        namespace_name=namespace,
        bucket_name=bucket,
        object_name=object_name,
    )

    with open(outfile, "wb") as fp:
        # Stream in 1 MiB chunks so the whole object is never held in memory.
        for chunk in get_resp.data.raw.stream(1024 * 1024, decode_content=False):
            fp.write(chunk)

    print(f"Downloaded → {outfile}")
34+
35+
if __name__ == "__main__":
    # Command-line entry point: both options are optional and fall back to
    # the defaults declared here.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "--profile",
        default="aisolutions",
        help="OCI CLI profile name (defaults to 'aisolutions')",
    )
    cli.add_argument(
        "--outfile",
        default="test.mp3.srt",
        help="Local output filename",
    )
    options = cli.parse_args()
    main(options.profile, options.outfile)

oci-subtitle-translation/generate_srt_from_audio.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -140,4 +140,4 @@ def load_config():
140140

141141
except Exception as e:
142142
log_step(f"Failed to create transcription job: {str(e)}", True)
143-
sys.exit(1)
143+
sys.exit(1)
Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
import oci
2+
import yaml
3+
import argparse
4+
import os
5+
from pathlib import Path
6+
7+
def load_config(path='config.yaml'):
    """Load the YAML configuration file and return its parsed content.

    Args:
        path: Location of the YAML file. Defaults to ``config.yaml``,
            resolved against the current working directory.

    Returns:
        The object produced by ``yaml.safe_load`` for the file.

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the content is not valid YAML.
    """
    # Decode explicitly as UTF-8 so parsing does not depend on the
    # platform's locale encoding.
    with open(path, 'r', encoding='utf-8') as f:
        return yaml.safe_load(f)
11+
12+
def translate_text(language_client, text, source_lang, target_lang, compartment_id):
    """
    Translate one string with a single synchronous OCI Language request.

    The text is wrapped in one document tagged with ``source_lang`` and sent
    through ``language_client.batch_language_translation``. Returns the
    translated text of the first returned document, or ``None`` when the
    response is not a 200 carrying documents or when the call raises
    ``oci.exceptions.ServiceError``.
    """
    try:
        lang_models = oci.ai_language.models

        # One-element batch: the source language travels on the document.
        single_document = lang_models.TextDocument(
            key="1",
            text=text,
            language_code=source_lang,
        )
        request_details = lang_models.BatchLanguageTranslationDetails(
            documents=[single_document],
            target_language_code=target_lang,
            compartment_id=compartment_id,
        )

        # Blocking call: the result is available on the returned response.
        response = language_client.batch_language_translation(
            batch_language_translation_details=request_details
        )

        # Guard clause for the failure case; success falls through below.
        translated = response.status == 200 and response.data.documents
        if not translated:
            print(f"Error during translation to {target_lang}: {response.data}")
            return None

        print(f"Successfully translated to {target_lang}")
        return response.data.documents[0].translated_text

    except oci.exceptions.ServiceError as e:
        print(f"Error translating to {target_lang}: {e}")
        return None
47+
48+
def main(argv=None):
    """Translate an SRT file into one or more languages with OCI Language.

    Reads ``config.yaml`` (via ``load_config``) for the Language compartment
    OCID and the OCI profile name, translates the whole input file once per
    target language with ``translate_text`` and writes each result to
    ``<lang_code>_<input file name>`` in the current working directory.

    Args:
        argv: Optional list of command-line arguments. ``None`` (the
            default) makes argparse read ``sys.argv[1:]``.

    Raises:
        SystemExit: With status 1 when the input file is missing, the
            configuration cannot be loaded, or at least one translation
            fails, so shell callers can detect the failure.
    """
    import sys  # local import: not provided by this file's import block

    SUPPORTED_LANGUAGES = {
        'ar': 'Arabic', 'hr': 'Croatian', 'cs': 'Czech', 'da': 'Danish',
        'nl': 'Dutch', 'en': 'English', 'fi': 'Finnish', 'fr': 'French',
        'fr-CA': 'French Canadian', 'de': 'German', 'el': 'Greek',
        'he': 'Hebrew', 'hu': 'Hungarian', 'it': 'Italian', 'ja': 'Japanese',
        'ko': 'Korean', 'no': 'Norwegian', 'pl': 'Polish', 'pt': 'Portuguese',
        'pt-BR': 'Portuguese Brazilian', 'ro': 'Romanian', 'ru': 'Russian',
        'zh-CN': 'Simplified Chinese', 'sk': 'Slovak', 'sl': 'Slovenian',
        'es': 'Spanish', 'sv': 'Swedish', 'th': 'Thai', 'zh-TW': 'Traditional Chinese',
        'tr': 'Turkish', 'vi': 'Vietnamese'
    }

    parser = argparse.ArgumentParser(description='Translate SRT files using OCI Language')
    parser.add_argument('--input-file', required=True, help='Input SRT file path')
    parser.add_argument('--source-lang', default='en', help='Source language code')
    parser.add_argument('--target-langs', nargs='+', help='Target language codes (space-separated)')
    args = parser.parse_args(argv)

    # is_file() rather than exists(): a directory would pass exists() and
    # then fail later inside read_text().
    input_path = Path(args.input_file)
    if not input_path.is_file():
        print(f"Error: Input file {args.input_file} not found")
        sys.exit(1)

    # Load YAML configuration
    try:
        config_yaml = load_config()
    except (OSError, yaml.YAMLError) as e:
        print(f"ERROR: Failed to load config.yaml: {e}")
        sys.exit(1)
    if not isinstance(config_yaml, dict):
        print("ERROR: config.yaml must contain a mapping at the top level")
        sys.exit(1)
    try:
        language_compartment_id = config_yaml['language']['compartment_id']
    except (KeyError, TypeError):
        print("ERROR: config.yaml is missing language.compartment_id")
        sys.exit(1)

    # Load OCI config from the profile specified in the YAML
    profile_name = config_yaml.get("profile", "DEFAULT")
    try:
        oci_config = oci.config.from_file(profile_name=profile_name)
        region = oci_config.get("region", "unknown")
        print(f"INFO: Loaded OCI profile '{profile_name}' (region '{region}')")
    except Exception as e:
        print(f"ERROR: Failed to load OCI configuration: {e}")
        sys.exit(1)

    # Initialize client
    language_client = oci.ai_language.AIServiceLanguageClient(oci_config)

    # Read the content of the source SRT file.
    # NOTE(review): the entire file, cue numbers and timestamps included, is
    # sent as one document; confirm the service's per-document size limit
    # and that timestamps survive translation.
    source_text = input_path.read_text(encoding='utf-8')

    # No explicit targets means "every supported language".
    target_langs = args.target_langs or list(SUPPORTED_LANGUAGES)

    failed = []
    for lang_code in target_langs:
        if lang_code not in SUPPORTED_LANGUAGES:
            print(f"Warning: Unsupported language code '{lang_code}', skipping...")
            continue

        # Translating into the source language is a no-op; skip it silently.
        if lang_code == args.source_lang:
            continue

        print(f"Translating to {SUPPORTED_LANGUAGES[lang_code]} ({lang_code})...")

        translated_text = translate_text(
            language_client,
            source_text,
            args.source_lang,
            lang_code,
            language_compartment_id
        )

        if not translated_text:
            # translate_text has already printed the reason.
            failed.append(lang_code)
            continue

        # Save the translated text to a new file
        output_filename = f"{lang_code}_{input_path.name}"
        Path(output_filename).write_text(translated_text, encoding='utf-8')
        print(f"Saved translated file to: {output_filename}")

    if failed:
        print(f"ERROR: Translation failed for: {', '.join(failed)}")
        sys.exit(1)
115+
116+
117+
# Entry-point guard: call main() only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Lines changed: 187 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,187 @@
1+
import oci
2+
import yaml
3+
import argparse
4+
import os
5+
import time
6+
from pathlib import Path
7+
8+
# --- Helper Functions ---
9+
10+
def load_config(path='config.yaml'):
    """Load the YAML configuration file and return its parsed content.

    Args:
        path: Location of the YAML file. Defaults to ``config.yaml``,
            resolved against the current working directory.

    Returns:
        The object produced by ``yaml.safe_load`` for the file.

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the content is not valid YAML.
    """
    # Decode explicitly as UTF-8 so parsing does not depend on the
    # platform's locale encoding.
    with open(path, 'r', encoding='utf-8') as f:
        return yaml.safe_load(f)
14+
15+
def upload_to_object_storage(object_storage_client, namespace, bucket_name, file_path):
    """Send a local file to an OCI Object Storage bucket.

    The object is stored under the base name of *file_path* (directory
    components are dropped). The file is opened in binary mode and handed to
    ``object_storage_client.put_object`` as the body.

    Returns:
        The object name used for the upload.
    """
    object_name = os.path.basename(file_path)
    print(f"INFO: Uploading '{object_name}' to bucket '{bucket_name}'...")

    with open(file_path, 'rb') as source:
        object_storage_client.put_object(namespace, bucket_name, object_name, source)

    print("INFO: Upload complete.")
    return object_name
23+
24+
def wait_for_job_completion(client, job_id, compartment_id, check_interval=30, timeout=None):
    """Poll a Language job until it succeeds, fails, or the wait times out.

    Args:
        client: Client object exposing ``get_job(job_id=...)``.
        job_id: Identifier of the job to poll.
        compartment_id: Not used by this function; kept so existing callers
            keep working.
        check_interval: Seconds to sleep between two polls.
        timeout: Optional upper bound, in seconds, on the total wait.
            ``None`` (the default) waits indefinitely.

    Returns:
        ``True`` when the job reaches SUCCEEDED. ``False`` when it reaches
        FAILED or CANCELED, when the status call raises
        ``oci.exceptions.ServiceError``, or when ``timeout`` elapses first.
    """
    job_states = oci.ai_language.models.Job
    failure_states = (
        job_states.LIFECYCLE_STATE_FAILED,
        job_states.LIFECYCLE_STATE_CANCELED,
    )
    # Monotonic clock: unaffected by wall-clock adjustments during the wait.
    deadline = None if timeout is None else time.monotonic() + timeout

    while True:
        try:
            status = client.get_job(job_id=job_id).data.lifecycle_state
        except oci.exceptions.ServiceError as e:
            print(f"ERROR: Error checking job status: {e}")
            return False

        if status == job_states.LIFECYCLE_STATE_SUCCEEDED:
            print("INFO: Job succeeded.")
            return True
        if status in failure_states:
            print(f"ERROR: Job failed with status: {status}")
            return False

        # Without this bound, a job that never reaches a terminal state
        # would keep the loop running forever.
        if deadline is not None and time.monotonic() >= deadline:
            print(f"ERROR: Timed out after {timeout} seconds waiting for job (last status: {status})")
            return False

        print(f"INFO: Job status: {status}. Waiting {check_interval} seconds...")
        time.sleep(check_interval)
46+
47+
# --- Model Discovery (with caching) ---
48+
# Maps a model display name to its OCID so each language pair is looked up
# at most once per process. Only successful lookups are stored.
model_cache = {}


def get_translation_model_id(language_client, tenancy_id, source_lang, target_lang):
    """Find the OCID of the pre-trained translation model for a language pair.

    The model is matched by display name,
    ``"Pre-trained Translation model <source>-<target>"``, where both codes
    are cut at the first ``-`` (so ``pt-BR`` is looked up as ``pt``).

    Args:
        language_client: Client exposing ``list_models(compartment_id=...,
            page=...)``.
        tenancy_id: Compartment OCID to list models from (the tenancy root).
        source_lang: Source language code.
        target_lang: Target language code.

    Returns:
        The model OCID, or ``None`` when no model with that display name
        exists or when listing raises ``oci.exceptions.ServiceError``.
    """
    # OCI uses 2-letter codes for this model format, e.g., 'en-es'
    source = source_lang.split('-')[0]
    target = target_lang.split('-')[0]
    model_name = f"Pre-trained Translation model {source}-{target}"

    if model_name in model_cache:
        return model_cache[model_name]

    print(f"INFO: Searching for model '{model_name}'...")
    try:
        # Pre-trained models are in the root compartment of the tenancy.
        # Follow pagination: the listing may span several pages, and looking
        # only at the first one would report later models as missing.
        page = None
        while True:
            page_args = {} if page is None else {"page": page}
            list_models_response = language_client.list_models(
                compartment_id=tenancy_id, **page_args
            )

            for model in list_models_response.data.items:
                if model.display_name == model_name:
                    print(f"INFO: Found model ID: {model.id}")
                    model_cache[model_name] = model.id
                    return model.id

            page = getattr(list_models_response, "next_page", None)
            if not page:
                break
    except oci.exceptions.ServiceError as e:
        print(f"ERROR: Could not list models. Check permissions for the root compartment. {e}")
        return None

    print(f"ERROR: Pre-trained translation model not found for {source_lang} -> {target_lang}")
    return None
76+
77+
# --- Main Translation Logic ---
78+
79+
def translate_srt_async(language_client, object_storage_client, config_yaml, model_id, input_file,
                        target_lang=None):
    """
    Upload a file to Object Storage and run an asynchronous Language job on it.

    Args:
        language_client: Client exposing ``create_job`` and ``get_job``.
        object_storage_client: Client exposing ``put_object``.
        config_yaml: Parsed configuration; reads ``speech.namespace``,
            ``speech.bucket_name`` and ``language.compartment_id``.
        model_id: OCID of the model the job should use.
        input_file: Local path of the file to upload and translate.
        target_lang: Optional target language code, used only to label the
            job. When omitted, the label carries the object name alone.

    Returns:
        ``True`` when the job succeeds. ``False`` when job creation raises
        ``oci.exceptions.ServiceError`` or the job does not succeed.
    """
    namespace = config_yaml['speech']['namespace']
    bucket_name = config_yaml['speech']['bucket_name']
    compartment_id = config_yaml['language']['compartment_id']

    try:
        # 1. Upload the source file to Object Storage
        object_name = upload_to_object_storage(object_storage_client, namespace, bucket_name, input_file)

        # The label uses the caller-supplied language code. It was previously
        # derived with model_id.split('-')[-1], but a model OCID is an opaque
        # identifier, so that produced OCID fragments rather than a language.
        display_name = f"Translate_{object_name}"
        if target_lang:
            display_name = f"{display_name}_to_{target_lang}"

        # 2. Define input and output locations in Object Storage
        input_location = oci.ai_language.models.ObjectStorageFileNameLocation(
            namespace_name=namespace,
            bucket_name=bucket_name,
            object_names=[object_name]
        )

        output_location = oci.ai_language.models.ObjectPrefixOutputLocation(
            namespace_name=namespace,
            bucket_name=bucket_name,
            prefix=f"translated_output/{Path(input_file).stem}/"
        )

        # 3. Define the job details, referencing the pre-trained model ID
        create_job_details = oci.ai_language.models.CreateJobDetails(
            display_name=display_name,
            compartment_id=compartment_id,
            input_location=input_location,
            output_location=output_location,
            model_metadata_details=[
                oci.ai_language.models.ModelMetadataDetails(model_id=model_id)
            ]
        )

        # 4. Create the job
        create_job_response = language_client.create_job(create_job_details=create_job_details)
        job_id = create_job_response.data.id
        print(f"INFO: Job created with ID: {job_id}")

        # 5. Wait for the job to complete
        return wait_for_job_completion(language_client, job_id, compartment_id)

    except oci.exceptions.ServiceError as e:
        print(f"ERROR: Failed to create translation job: {e}")
        return False


def main(argv=None):
    """Translate an SRT file into several languages via asynchronous jobs.

    For every requested target language the matching pre-trained model is
    looked up with ``get_translation_model_id`` and a job is run with
    ``translate_srt_async``.

    Args:
        argv: Optional list of command-line arguments. ``None`` (the
            default) makes argparse read ``sys.argv[1:]``.

    Raises:
        SystemExit: With status 1 when the input file is missing, the
            configuration cannot be loaded, or at least one language could
            not be translated.
    """
    import sys  # local import: not provided by this file's import block

    SUPPORTED_LANGUAGES = {
        'ar': 'Arabic', 'hr': 'Croatian', 'cs': 'Czech', 'da': 'Danish',
        'nl': 'Dutch', 'en': 'English', 'fi': 'Finnish', 'fr': 'French',
        'fr-CA': 'French Canadian', 'de': 'German', 'el': 'Greek',
        'he': 'Hebrew', 'hu': 'Hungarian', 'it': 'Italian', 'ja': 'Japanese',
        'ko': 'Korean', 'no': 'Norwegian', 'pl': 'Polish', 'pt': 'Portuguese',
        'pt-BR': 'Portuguese Brazilian', 'ro': 'Romanian', 'ru': 'Russian',
        'zh-CN': 'Simplified Chinese', 'sk': 'Slovak', 'sl': 'Slovenian',
        'es': 'Spanish', 'sv': 'Swedish', 'th': 'Thai', 'zh-TW': 'Traditional Chinese',
        'tr': 'Turkish', 'vi': 'Vietnamese'
    }
    parser = argparse.ArgumentParser(description='Translate SRT files using OCI Language (Async Object Storage Method)')
    parser.add_argument('--input-file', required=True, help='Input SRT file path')
    parser.add_argument('--source-lang', default='en', help='Source language code (e.g., en)')
    parser.add_argument('--target-langs', nargs='+', help='Target language codes (e.g., es fr de)')
    args = parser.parse_args(argv)

    # isfile() rather than exists(): a directory cannot be uploaded.
    if not os.path.isfile(args.input_file):
        print(f"Error: Input file {args.input_file} not found")
        sys.exit(1)

    try:
        config_yaml = load_config()
    except (OSError, yaml.YAMLError) as e:
        print(f"ERROR: Failed to load config.yaml: {e}")
        sys.exit(1)
    if not isinstance(config_yaml, dict):
        print("ERROR: config.yaml must contain a mapping at the top level")
        sys.exit(1)

    profile_name = config_yaml.get("profile", "DEFAULT")
    try:
        oci_config = oci.config.from_file(profile_name=profile_name)
        tenancy_id = oci_config.get("tenancy")
        print(f"INFO: Loaded OCI profile '{profile_name}' for tenancy '{tenancy_id}'")
    except Exception as e:
        print(f"ERROR: Failed to load OCI configuration: {e}")
        sys.exit(1)

    language_client = oci.ai_language.AIServiceLanguageClient(oci_config)
    object_storage_client = oci.object_storage.ObjectStorageClient(oci_config)

    # No explicit targets means "every supported language".
    target_langs = args.target_langs or list(SUPPORTED_LANGUAGES)

    failed = []
    for lang_code in target_langs:
        # Reject unknown codes up front instead of spending a model lookup.
        if lang_code not in SUPPORTED_LANGUAGES:
            print(f"Warning: Unsupported language code '{lang_code}', skipping...")
            continue
        if lang_code == args.source_lang:
            continue
        print("-" * 50)
        print(f"Starting translation process for {args.source_lang} -> {lang_code}")

        # 1. Find the correct pre-trained model for this language pair
        model_id = get_translation_model_id(language_client, tenancy_id, args.source_lang, lang_code)

        # 2. If model is found, start the asynchronous translation job
        succeeded = bool(model_id) and translate_srt_async(
            language_client,
            object_storage_client,
            config_yaml,
            model_id,
            args.input_file,
            target_lang=lang_code,
        )
        if not succeeded:
            # The helper has already printed the reason.
            failed.append(lang_code)
        print("-" * 50)

    if failed:
        print(f"ERROR: Translation failed for: {', '.join(failed)}")
        sys.exit(1)
185+
186+
# Entry-point guard: call main() only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)