From 4deca0fa94fe71fbd2a2167305351bba7aa3166a Mon Sep 17 00:00:00 2001 From: rgraber Date: Wed, 22 Oct 2025 11:40:14 -0400 Subject: [PATCH 01/18] temp: initial commit --- kobo/apps/subsequences/tests/test_models.py | 1 + .../subsequences/tests/test_versioning.py | 33 +++++++++++++++++++ kobo/apps/subsequences/utils/versioning.py | 22 +++++++++++++ kobo/settings/base.py | 2 +- 4 files changed, 57 insertions(+), 1 deletion(-) create mode 100644 kobo/apps/subsequences/tests/test_versioning.py diff --git a/kobo/apps/subsequences/tests/test_models.py b/kobo/apps/subsequences/tests/test_models.py index 6e49e1c530..81bf26f267 100644 --- a/kobo/apps/subsequences/tests/test_models.py +++ b/kobo/apps/subsequences/tests/test_models.py @@ -337,3 +337,4 @@ def test_revise_data_raise_error_wrong_question_name(self): }, }, ) + diff --git a/kobo/apps/subsequences/tests/test_versioning.py b/kobo/apps/subsequences/tests/test_versioning.py new file mode 100644 index 0000000000..3166bbcf7a --- /dev/null +++ b/kobo/apps/subsequences/tests/test_versioning.py @@ -0,0 +1,33 @@ +from django.test import TestCase + +class TestVersioning(TestCase): + def test_migrate_submission_extra_to_supplemental(self): + old_version = {'Audio_question': {'googlets': {'languageCode': 'en', + 'regionCode': None, + 'status': 'complete', + 'value': 'This is audio that I am trying to ' + 'transcribe.'}, + 'googletx': {'languageCode': 'es', + 'source': 'en', + 'status': 'complete', + 'value': 'Este es un audio que estoy ' + 'intentando transcribir.'}, + 'qual': [{'type': 'qual_text', + 'uuid': 'b8188424-6249-4168-8137-7d9fab62ae3c', + 'val': 'Trying to transcribe audio'}], + 'transcript': {'dateCreated': '2025-10-22 14:30:24', + 'dateModified': '2025-10-22 14:30:24', + 'languageCode': 'en', + 'revisions': [{}], + 'value': 'This is audio that I am trying to ' + 'transcribe.'}, + 'translation': {'es': {'dateCreated': '2025-10-22T14:30:38Z', + 'dateModified': '2025-10-22T14:30:38Z', + 'languageCode': 'es', + 'revisions': [], + 'value': 'Este es un audio que ' + 'estoy intentando ' + 'transcribir.'}}}} + + self.assertEqual(True, False) # add assertion here + diff --git a/kobo/apps/subsequences/utils/versioning.py b/kobo/apps/subsequences/utils/versioning.py index aba7b21852..8276e6ae38 100644 --- a/kobo/apps/subsequences/utils/versioning.py +++ b/kobo/apps/subsequences/utils/versioning.py @@ -13,6 +13,7 @@ def migrate_advanced_features(advanced_features: dict) -> dict | None: actionConfigs = migrated_advanced_features['_actionConfigs'] for key, value in advanced_features.items(): + print(f'{key=}, {value=}') if ( key == 'transcript' and value @@ -34,6 +35,7 @@ def migrate_advanced_features(advanced_features: dict) -> dict | None: ] if key == 'qual': + survey_qs = value['qual_survey'] raise NotImplementedError return migrated_advanced_features @@ -42,3 +44,23 @@ def migrate_advanced_features(advanced_features: dict) -> dict | None: def set_version(schema: dict) -> dict: schema['_version'] = SCHEMA_VERSIONS[0] return schema + +def migrate_submission_supplementals(supplemental_data:dict) -> dict: + if supplemental_data.get('_version', None) == SCHEMA_VERSIONS[0]: + return + supplemental = { + '_version': SCHEMA_VERSIONS[0], + } + for question_xpath, action_results in supplemental_data: + question_results_by_action = {} + for action, results in action_results: + if action == 'googlets': + pass + if action == 'googletx': + pass + if action == 'qual': + pass + if action == 'transcript': + pass + if action == 'translation': + pass diff --git a/kobo/settings/base.py b/kobo/settings/base.py index 03c4d13887..e64acfefe5 100644 --- a/kobo/settings/base.py +++ b/kobo/settings/base.py @@ -1587,7 +1587,7 @@ def dj_stripe_request_callback_method(): # Google Cloud Storage # Not fully supported as a generic storage backend -GS_BUCKET_NAME = env.str('GS_BUCKET_NAME', None) +GS_BUCKET_NAME = 'kobo-transcription-test' #env.str('GS_BUCKET_NAME', None) """ Django error logging configuration """ From 87ba5a47d508b45ab06f958c79765575c0fb1d48 Mon Sep 17 00:00:00 2001 From: rgraber Date: Wed, 22 Oct 2025 15:27:01 -0400 Subject: [PATCH 02/18] fixup!: stuff --- .../subsequences/tests/test_versioning.py | 101 ++++++++++++++++-- kobo/apps/subsequences/utils/versioning.py | 3 + 2 files changed, 94 insertions(+), 10 deletions(-) diff --git a/kobo/apps/subsequences/tests/test_versioning.py b/kobo/apps/subsequences/tests/test_versioning.py index 3166bbcf7a..6372b033c7 100644 --- a/kobo/apps/subsequences/tests/test_versioning.py +++ b/kobo/apps/subsequences/tests/test_versioning.py @@ -12,22 +12,103 @@ def test_migrate_submission_extra_to_supplemental(self): 'status': 'complete', 'value': 'Este es un audio que estoy ' 'intentando transcribir.'}, - 'qual': [{'type': 'qual_text', - 'uuid': 'b8188424-6249-4168-8137-7d9fab62ae3c', - 'val': 'Trying to transcribe audio'}], - 'transcript': {'dateCreated': '2025-10-22 14:30:24', - 'dateModified': '2025-10-22 14:30:24', + 'transcript': {'dateCreated': None, + 'dateModified': '2025-10-22 17:09:38', 'languageCode': 'en', - 'revisions': [{}], + 'revisions': [{'dateModified': '2025-10-22 ' + '14:30:24', + 'languageCode': 'en', + 'value': 'This is audio that ' + 'I am trying to ' + 'transcribe.'}, + {}], 'value': 'This is audio that I am trying to ' - 'transcribe.'}, + 'transcribe but i edited it.'}, 'translation': {'es': {'dateCreated': '2025-10-22T14:30:38Z', - 'dateModified': '2025-10-22T14:30:38Z', + 'dateModified': '2025-10-22T17:10:23Z', 'languageCode': 'es', - 'revisions': [], + 'revisions': [{'dateModified': '2025-10-22T14:30:38Z', + 'languageCode': 'es', + 'value': 'Este es un ' + 'audio que ' + 'estoy ' + 'intentando ' + 'transcribir.'}], 'value': 'Este es un audio que ' 'estoy intentando ' - 'transcribir.'}}}} + 'transcribir pero yo lo edité'}}}} + + new_version = { + '_version': '20250820', + 'Audio_question': { + 'automatic_transcription': { + '_dateCreated': '', + '_dateModified': '', + '_versions': [ + { + '_dateCreated': '', + '_dateAccepted': '', + '_uuid':'', + 'language': 'en', + 'value': 'This is audio that I am trying to ' + 'transcribe.', + 'status': 'complete', + } + ] + }, + 'automatic_translation': { + 'es': { + '_dateCreated': '', + '_dateModified': '', + '_versions': [ + { + '_dateCreated': '', + '_dateAccepted': '', + '_dependency': {'_actionId': 'manual_transcription', + '_uuid': 'a0030a86-d207-4249-8335-9a767fbd77eb'}, + '_uuid':'', + 'language': 'es', + 'value': 'Esto es un audio que estoy intendando a transcribir', + 'status': 'complete' + } + ] + } + }, + 'manual_transcription': { + '_dateCreated': '', + '_dateModified': '', + '_versions': [ + { + '_dateCreated': '', + '_dateAccepted': '', + '_uuid':'', + 'language': 'en', + 'value': 'This is audio that I am trying to ' + 'transcribe but i edited it.', + } + ] + }, + 'manual_translation': { + 'es': { + '_dateCreated': '', + '_dateModified': '', + '_versions': [ + { + '_dateCreated': '', + '_dateAccepted': '', + '_dependency': {'_actionId': 'automatic_transcription', + '_uuid': 'a0030a86-d207-4249-8335-9a767fbd77eb'}, + '_uuid':'', + 'language': 'es', + 'value': 'Esto es un audio que estoy intendando a transcribir pero yo lo edité', + 'status': 'complete' + } + ] + } + }, + } + } + self.assertEqual(True, False) # add assertion here diff --git a/kobo/apps/subsequences/utils/versioning.py b/kobo/apps/subsequences/utils/versioning.py index 8276e6ae38..d3f17cbff6 100644 --- a/kobo/apps/subsequences/utils/versioning.py +++ b/kobo/apps/subsequences/utils/versioning.py @@ -64,3 +64,6 @@ def migrate_submission_supplementals(supplemental_data:dict) -> dict: pass if action == 'translation': pass + +def get_automated_transcriptions_by_language(action_results:dict) -> dict: + pass From 8e82e61a8c42b6656ebffc9d9787c51086e41e92 Mon Sep 17 00:00:00 2001 From: rgraber Date: Thu, 23 Oct 2025 12:39:34 -0400 Subject: [PATCH 03/18] fixup!: stuff --- .../subsequences/tests/test_versioning.py | 132 ++++++++++++++++ kobo/apps/subsequences/utils/versioning.py | 142 +++++++++++++++--- 2 files changed, 257 insertions(+), 17 deletions(-) diff --git a/kobo/apps/subsequences/tests/test_versioning.py b/kobo/apps/subsequences/tests/test_versioning.py index 6372b033c7..90aa65957d 100644 --- a/kobo/apps/subsequences/tests/test_versioning.py +++ b/kobo/apps/subsequences/tests/test_versioning.py @@ -1,6 +1,138 @@ +from datetime import timedelta + +import pytest +from mock import patch +from ddt import data, ddt from django.test import TestCase +from django.utils import timezone +from freezegun import freeze_time + +from kobo.apps.subsequences.utils.versioning import ( + new_transcript_revision_from_old, + separate_transcriptions, migrate_advanced_features, migrate_submission_supplementals, +) + +@ddt class TestVersioning(TestCase): + def test_new_transcript_revision_from_old(self): + now = timezone.now() + old = { + 'dateCreated': None, + 'dateModified': '2025-10-22 17:09:38', + 'languageCode': 'en', + 'value': 'Transcribed new', + } + with freeze_time(now): + result = new_transcript_revision_from_old(old) + assert result['value'] == old['value'] + assert result['language'] == old['languageCode'] + assert result['_dateCreated'] == old['dateModified'] + assert result['_uuid'] is not None + assert result['_dateAccepted'] is None + + def test_new_transcript_revision_from_old_returns_none_for_bad_data(self): + old = {'badly': 'formatted'} + assert new_transcript_revision_from_old(old) is None + + @data(True, False) + def test_separate_automated_and_manual_transcriptions(self, latest_is_automated): + now = timezone.now() + yesterday = timezone.now() - timedelta(days=1) + transcript_dict = { + 'dateCreated': None, + 'dateModified': now, + 'languageCode': 'en', + 'revisions': [ + { + 'dateModified': yesterday, + 'languageCode': 'en', + 'value': 'Old transcript', + } + ], + 'value': 'Latest transcript', + } + automated_transcription_value = ( + 'Latest transcript' if latest_is_automated else 'Old transcript' + ) + manual, automated = separate_transcriptions( + transcript_dict, 'en', automated_transcription_value + ) + new_automated_transcript = automated[0] + new_manual_transcript = manual[0] + expected_most_recent_transcript = ( + new_automated_transcript if latest_is_automated else new_manual_transcript + ) + expected_old_transcript = ( + new_manual_transcript if latest_is_automated else new_automated_transcript + ) + + assert expected_most_recent_transcript['_dateCreated'] == now + assert expected_most_recent_transcript['value'] == 'Latest transcript' + assert expected_old_transcript['_dateCreated'] == yesterday + assert expected_old_transcript['value'] == 'Old transcript' + + def test_migrate_transcriptions(self): + now = timezone.now() + one_year_ago = now - timedelta(days=365) + old_version = {'Audio_question': {'googlets': {'languageCode': 'en', + 'regionCode': None, + 'status': 'complete', + 'value': 'This is audio that I am trying to ' + 'transcribe.'}, + 'transcript': {'dateCreated': one_year_ago, + 'dateModified': now, + 'languageCode': 'en', + 'revisions': [{'dateModified': one_year_ago, + 'languageCode': 'en', + 'value': 'This is audio that ' + 'I am trying to ' + 'transcribe.'}, + {}], + 'value': 'This is audio that I am trying to ' + 'transcribe but i edited it.'}, + } + } + with patch('kobo.apps.subsequences.utils.versioning.generate_uuid_for_form', side_effect=['uuid1', 'uuid2']): + with freeze_time(now): + migrated = migrate_submission_supplementals(old_version) + expected = { + '_version': '20250820', + 'Audio_question': { + 'automatic_transcription': { + '_dateCreated': one_year_ago, + '_dateModified': one_year_ago, + '_versions': [ + { + '_dateCreated': one_year_ago, + '_dateAccepted': now, + '_uuid':'uuid2', + 'language': 'en', + 'value': 'This is audio that I am trying to ' + 'transcribe.', + 'status': 'complete', + } + ] + }, + 'manual_transcription': { + '_dateCreated': now, + '_dateModified': now, + '_versions': [ + { + '_dateCreated': now, + '_dateAccepted': None, + '_uuid':'uuid1', + 'language': 'en', + 'value': 'This is audio that I am trying to ' + 'transcribe but i edited it.', + } + ] + }, + } + } + assert migrated == expected + + @pytest.mark.skip() def test_migrate_submission_extra_to_supplemental(self): old_version = {'Audio_question': {'googlets': {'languageCode': 'en', 'regionCode': None, diff --git a/kobo/apps/subsequences/utils/versioning.py b/kobo/apps/subsequences/utils/versioning.py index d3f17cbff6..931d2b38c9 100644 --- a/kobo/apps/subsequences/utils/versioning.py +++ b/kobo/apps/subsequences/utils/versioning.py @@ -1,5 +1,13 @@ +from django.utils import timezone + +from ..actions import ManualTranscriptionAction +from ...openrosa.libs.utils.model_tools import generate_uuid_for_form from ..constants import SCHEMA_VERSIONS +from ...subsequences__old.actions.automatic_transcription import AutomaticTranscriptionAction + +class InvalidSupplementalFormat(Exception): + pass def migrate_advanced_features(advanced_features: dict) -> dict | None: @@ -13,7 +21,6 @@ def migrate_advanced_features(advanced_features: dict) -> dict | None: actionConfigs = migrated_advanced_features['_actionConfigs'] for key, value in advanced_features.items(): - print(f'{key=}, {value=}') if ( key == 'transcript' and value @@ -35,7 +42,6 @@ def migrate_advanced_features(advanced_features: dict) -> dict | None: ] if key == 'qual': - survey_qs = value['qual_survey'] raise NotImplementedError return migrated_advanced_features @@ -51,19 +57,121 @@ def migrate_submission_supplementals(supplemental_data:dict) -> dict: supplemental = { '_version': SCHEMA_VERSIONS[0], } - for question_xpath, action_results in supplemental_data: + for question_xpath, action_results in supplemental_data.items(): question_results_by_action = {} - for action, results in action_results: - if action == 'googlets': - pass - if action == 'googletx': - pass - if action == 'qual': - pass - if action == 'transcript': - pass - if action == 'translation': - pass - -def get_automated_transcriptions_by_language(action_results:dict) -> dict: - pass + automatic_transcript_language, automatic_transcript_result = ( + get_automatic_transcription(action_results) + ) + manual_transcripts, automatic_transcripts = separate_transcriptions( + action_results.get('transcript', None), + automatic_transcript_language, + automatic_transcript_result, + ) + # should already be sorted by date created descending, but just in case + manual_transcripts.sort(reverse=True, key=lambda d: d['_dateCreated']) + automatic_transcripts.sort(reverse=True, key=lambda d: d['_dateCreated']) + + if len(manual_transcripts) > 0: + question_results_by_action['manual_transcription'] = { + '_dateCreated': manual_transcripts[-1]['_dateCreated'], + '_dateModified': manual_transcripts[0]['_dateCreated'], + '_versions': manual_transcripts, + } + if len(automatic_transcripts) > 0: + question_results_by_action['automatic_transcription'] = { + '_dateCreated': automatic_transcripts[-1]['_dateCreated'], + '_dateModified': automatic_transcripts[0]['_dateCreated'], + '_versions': automatic_transcripts, + } + supplemental[question_xpath] = question_results_by_action + + # translation + # get source + tagged_manual_transcripts = [{**transcript, '_actionId': ManualTranscriptionAction.ID} for transcript in manual_transcripts] + tagged_automatic_transcripts = [{**transcript, '_actionId': AutomaticTranscriptionAction.ID} for transcript in automatic_transcripts] + + all_tagged_transcripts = [*tagged_manual_transcripts, *tagged_automatic_transcripts] + all_tagged_transcripts.sort(reverse=True, key=lambda d: d['_dateCreated']) + + most_recent_transcript_uuids_by_language = {} + for transcript in all_tagged_transcripts: + if most_recent_transcript_uuids_by_language.get(transcript['language']) is None: + most_recent_transcript_uuids_by_language[transcript['language']] = {'_uuid': transcript['_uuid'], '_actionId': transcript['_actionId']} + + translations_dict = action_results.get('translation', {}) + for language_code, translations in translations_dict.items(): + pass + + + + + return supplemental + + +def get_automatic_transcription( + action_results: dict, +) -> tuple[str | None, str | None] | None: + googlets = action_results.get('googlets', {}) + return googlets.get('languageCode', None), googlets.get('value', None) + +def get_automatic_translation(action_results:dict): + googletx = action_results.get('googletx', {}) + return googletx.get('source', None), googletx.get('languageCode', None), googletx.get('value', None) + + + +def new_transcript_revision_from_old(old_transcript_revision_dict: dict) -> dict | None: + # ignore bad data + if ( + 'languageCode' not in old_transcript_revision_dict + or 'value' not in old_transcript_revision_dict + ): + return None + return { + '_dateCreated': old_transcript_revision_dict.get('dateModified', None), + 'language': old_transcript_revision_dict['languageCode'], + 'value': old_transcript_revision_dict['value'], + '_uuid': generate_uuid_for_form(), + '_dateAccepted': None, + } + + +def separate_transcriptions( + transcription_dict: dict, + automatic_transcript_language: str = None, + automatic_transcript_value: str = None, +) -> tuple[list, list]: + if not transcription_dict: + return [], [] + automatic_transcriptions = [] + manual_transcriptions = [] + latest_revision = new_transcript_revision_from_old(transcription_dict) + if latest_revision: + if ( + latest_revision['value'] == automatic_transcript_value + and latest_revision['language'] == automatic_transcript_language + ): + latest_revision['status'] = 'complete' + latest_revision['_dateAccepted'] = timezone.now() + automatic_transcriptions.append(latest_revision) + else: + manual_transcriptions.append(latest_revision) + + for revision in transcription_dict.get('revisions', []): + revision_formatted = new_transcript_revision_from_old(revision) + if revision_formatted is None: + continue + if ( + revision_formatted['language'] == automatic_transcript_language + and revision['value'] == automatic_transcript_value + ): + revision_formatted['status'] = 'complete' + revision_formatted['_dateAccepted'] = timezone.now() + automatic_transcriptions.append(revision_formatted) + else: + manual_transcriptions.append(revision_formatted) + return manual_transcriptions, automatic_transcriptions + +def separate_translations(translation_dict): + if not translation_dict: + return [],[] From 89f236f140114ee93a95d90bb8033a7848ca1079 Mon Sep 17 00:00:00 2001 From: rgraber Date: Thu, 23 Oct 2025 14:31:54 -0400 Subject: [PATCH 04/18] fixup!: stuff --- kobo/apps/subsequences/utils/versioning.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/kobo/apps/subsequences/utils/versioning.py b/kobo/apps/subsequences/utils/versioning.py index 931d2b38c9..2557290d1b 100644 --- a/kobo/apps/subsequences/utils/versioning.py +++ b/kobo/apps/subsequences/utils/versioning.py @@ -1,9 +1,7 @@ from django.utils import timezone -from ..actions import ManualTranscriptionAction from ...openrosa.libs.utils.model_tools import generate_uuid_for_form from ..constants import SCHEMA_VERSIONS -from ...subsequences__old.actions.automatic_transcription import AutomaticTranscriptionAction class InvalidSupplementalFormat(Exception): @@ -87,8 +85,8 @@ def migrate_submission_supplementals(supplemental_data:dict) -> dict: # translation # get source - tagged_manual_transcripts = [{**transcript, '_actionId': ManualTranscriptionAction.ID} for transcript in manual_transcripts] - tagged_automatic_transcripts = [{**transcript, '_actionId': AutomaticTranscriptionAction.ID} for transcript in automatic_transcripts] + tagged_manual_transcripts = [{**transcript, '_actionId': 'manual_transcription'} for transcript in manual_transcripts] + tagged_automatic_transcripts = [{**transcript, '_actionId': 'manual_translation'} for transcript in automatic_transcripts] all_tagged_transcripts = [*tagged_manual_transcripts, *tagged_automatic_transcripts] all_tagged_transcripts.sort(reverse=True, key=lambda d: d['_dateCreated']) From cc9ce0135b44941fc41c1fe3c261a8776d09d03f Mon Sep 17 00:00:00 2001 From: rgraber Date: Fri, 24 Oct 2025 08:26:39 -0400 Subject: [PATCH 05/18] fixup!: stuff --- kobo/apps/subsequences/utils/versioning.py | 70 +++++++++++++++++++++- 1 file changed, 67 insertions(+), 3 deletions(-) diff --git a/kobo/apps/subsequences/utils/versioning.py b/kobo/apps/subsequences/utils/versioning.py index 2557290d1b..8e14ee4136 100644 --- a/kobo/apps/subsequences/utils/versioning.py +++ b/kobo/apps/subsequences/utils/versioning.py @@ -133,6 +133,25 @@ def new_transcript_revision_from_old(old_transcript_revision_dict: dict) -> dict '_dateAccepted': None, } +def new_translation_revision_from_old(old_translation_revision_dict: dict, source_uuid, source_action) -> dict | None: + # ignore bad data + if ( + 'languageCode' not in old_translation_revision_dict + or 'value' not in old_translation_revision_dict + ): + return None + return { + '_dateCreated': old_translation_revision_dict.get('dateModified', None), + 'language': old_translation_revision_dict['languageCode'], + 'value': old_translation_revision_dict['value'], + '_uuid': generate_uuid_for_form(), + '_dateAccepted': None, + '_dependency': { + '_actionId': source_action, + '_uuid': source_uuid, + } + } + def separate_transcriptions( transcription_dict: dict, @@ -170,6 +189,51 @@ def separate_transcriptions( manual_transcriptions.append(revision_formatted) return manual_transcriptions, automatic_transcriptions -def separate_translations(translation_dict): - if not translation_dict: - return [],[] +def separate_translations(language, translation_dict, + automatic_translation_language: str = None, + automatic_translation_value: str = None, source_uuid=None, source_action_id=None): + """ + {'es': {'dateCreated': '2025-10-22T14:30:38Z', + 'dateModified': '2025-10-22T17:10:23Z', + 'languageCode': 'es', + 'revisions': [{'dateModified': '2025-10-22T14:30:38Z', + 'languageCode': 'es', + 'value': 'Este es un ' + 'audio que ' + 'estoy ' + 'intentando ' + 'transcribir.'}], + 'value': 'Este es un audio que ' + 'estoy intentando ' + 'transcribir pero yo lo edité'}} + """ + automatic_translations = [] + manual_translations = [] + latest_revision = new_translation_revision_from_old(translation_dict, source_uuid=source_uuid, source_action=source_action_id) + if latest_revision: + if ( + latest_revision['value'] == automatic_translation_value + and latest_revision['language'] == automatic_translation_language + ): + latest_revision['status'] = 'complete' + latest_revision['_dateAccepted'] = timezone.now() + automatic_translations.append(latest_revision) + else: + manual_translations.append(latest_revision) + + for revision in translation_dict.get('revisions', []): + revision_formatted = new_transcript_revision_from_old(revision) + if revision_formatted is None: + continue + if ( + revision_formatted['language'] == automatic_translation_language + and revision['value'] == automatic_translation_value + ): + revision_formatted['status'] = 'complete' + revision_formatted['_dateAccepted'] = timezone.now() + automatic_translations.append(revision_formatted) + else: + manual_translations.append(revision_formatted) + return manual_translations, automatic_translations + + From 9b31b20566dcc685187fd215962792552ab2f319 Mon Sep 17 00:00:00 2001 From: rgraber Date: Fri, 24 Oct 2025 13:01:13 -0400 Subject: [PATCH 06/18] fixup!: stuff --- .../subsequences/tests/test_versioning.py | 52 +++++++- kobo/apps/subsequences/utils/versioning.py | 121 ++++++++++++------ 2 files changed, 126 insertions(+), 47 deletions(-) diff --git a/kobo/apps/subsequences/tests/test_versioning.py b/kobo/apps/subsequences/tests/test_versioning.py index 90aa65957d..4253431d87 100644 --- a/kobo/apps/subsequences/tests/test_versioning.py +++ b/kobo/apps/subsequences/tests/test_versioning.py @@ -1,15 +1,17 @@ from datetime import timedelta +from unittest.mock import patch import pytest -from mock import patch from ddt import data, ddt from django.test import TestCase from django.utils import timezone from freezegun import freeze_time from kobo.apps.subsequences.utils.versioning import ( - new_transcript_revision_from_old, - separate_transcriptions, migrate_advanced_features, migrate_submission_supplementals, + determine_source_transcripts, + migrate_submission_supplementals, + new_revision_from_old, + separate_transcriptions, ) @@ -24,7 +26,7 @@ def test_new_transcript_revision_from_old(self): 'value': 'Transcribed new', } with freeze_time(now): - result = new_transcript_revision_from_old(old) + result = new_revision_from_old(old) assert result['value'] == old['value'] assert result['language'] == old['languageCode'] assert result['_dateCreated'] == old['dateModified'] @@ -33,7 +35,7 @@ def test_new_transcript_revision_from_old(self): def test_new_transcript_revision_from_old_returns_none_for_bad_data(self): old = {'badly': 'formatted'} - assert new_transcript_revision_from_old(old) is None + assert new_revision_from_old(old) is None @data(True, False) def test_separate_automated_and_manual_transcriptions(self, latest_is_automated): @@ -132,6 +134,46 @@ def test_migrate_transcriptions(self): } assert migrated == expected + def test_determine_source_transcripts(self): + manual_transcripts = [] + automatic_transcripts = [] + now = timezone.now() + for i in range(5): + manual = { + '_dateCreated': now - timedelta(days=i), + 'language': 'en', + 'value': 'Value', + '_uuid': f'uuid-{i}-manual', + '_dateAccepted': None, + } + automatic = { + **manual, + '_uuid': f'uuid-{i}-automatic', + '_dateCreated': now - timedelta(days=i + 1), + } + manual_transcripts.append(manual) + automatic_transcripts.append(automatic) + # add an old transcript in a different language + manual_transcripts.append( + { + '_dateCreated': now - timedelta(days=5), + 'language': 'fr', + 'value': 'Value', + '_uuid': f'uuid-5-manual', + '_dateAccepted': None, + } + ) + most_recent_overall, most_recent_by_language = determine_source_transcripts( + manual_transcripts, automatic_transcripts + ) + assert most_recent_overall['_uuid'] == 'uuid-0-manual' + assert most_recent_overall['_actionId'] == 'manual_transcription' + assert most_recent_by_language['en']['_uuid'] == 'uuid-0-manual' + assert most_recent_by_language['en']['_actionId'] == 'manual_transcription' + assert most_recent_by_language['fr']['_uuid'] == 'uuid-5-manual' + assert most_recent_by_language['fr']['_actionId'] == 'manual_transcription' + + @pytest.mark.skip() def test_migrate_submission_extra_to_supplemental(self): old_version = {'Audio_question': {'googlets': {'languageCode': 'en', diff --git a/kobo/apps/subsequences/utils/versioning.py b/kobo/apps/subsequences/utils/versioning.py index 8e14ee4136..a02f0a5e82 100644 --- a/kobo/apps/subsequences/utils/versioning.py +++ b/kobo/apps/subsequences/utils/versioning.py @@ -84,26 +84,64 @@ def migrate_submission_supplementals(supplemental_data:dict) -> dict: supplemental[question_xpath] = question_results_by_action # translation - # get source - tagged_manual_transcripts = [{**transcript, '_actionId': 'manual_transcription'} for transcript in manual_transcripts] - tagged_automatic_transcripts = [{**transcript, '_actionId': 'manual_translation'} for transcript in automatic_transcripts] - - all_tagged_transcripts = [*tagged_manual_transcripts, *tagged_automatic_transcripts] - all_tagged_transcripts.sort(reverse=True, key=lambda d: d['_dateCreated']) - - most_recent_transcript_uuids_by_language = {} - for transcript in all_tagged_transcripts: - if most_recent_transcript_uuids_by_language.get(transcript['language']) is None: - most_recent_transcript_uuids_by_language[transcript['language']] = {'_uuid': transcript['_uuid'], '_actionId': transcript['_actionId']} + # determine what to use as the source transcript + most_recent_transcript, most_recent_transcript_by_language = ( + determine_source_transcripts(manual_transcripts, automatic_transcripts) + ) + ( + automatic_translation_source_language, + automatic_translation_language, + automatic_translation_value, + ) = get_automatic_translation(action_results) translations_dict = action_results.get('translation', {}) + automatic_translations = {} + manual_translations = {} for language_code, translations in translations_dict.items(): + automatic_translations_for_language = separate_translations( + language_code, + automatic_translation_source_language, + automatic_translation_language, + automatic_translation_value, + most_recent_transcript, + most_recent_transcript_by_language, + ) pass + return supplemental +def determine_source_transcripts(manual_transcripts, automatic_transcripts): + # First combine manual and automatic transcripts and sort by dateCreated descending + # tag them with the action so we don't lose track + tagged_manual_transcripts = [ + {**transcript, '_actionId': 'manual_transcription'} + for transcript in manual_transcripts + ] + tagged_automatic_transcripts = [ + {**transcript, '_actionId': 'automatic_translation'} + for transcript in automatic_transcripts + ] + + all_tagged_transcripts = [*tagged_manual_transcripts, *tagged_automatic_transcripts] + all_tagged_transcripts.sort(reverse=True, key=lambda d: d['_dateCreated']) + + # take the most recent transcript, manual or automatic, by language + most_recent_transcript_uuids_by_language = {} + for transcript in all_tagged_transcripts: + if most_recent_transcript_uuids_by_language.get(transcript['language']) is None: + most_recent_transcript_uuids_by_language[transcript['language']] = { + '_uuid': transcript['_uuid'], + '_actionId': transcript['_actionId'], + } - return supplemental + # we don't always know the source language of a translation, so also get the most recent transcript overall + most_recent_transcript_overall = all_tagged_transcripts[0] + most_recent_transcript_overall = { + '_uuid': most_recent_transcript_overall['_uuid'], + '_actionId': most_recent_transcript_overall['_actionId'], + } + return most_recent_transcript_overall, most_recent_transcript_uuids_by_language def get_automatic_transcription( @@ -114,11 +152,14 @@ def get_automatic_transcription( def get_automatic_translation(action_results:dict): googletx = action_results.get('googletx', {}) - return googletx.get('source', None), googletx.get('languageCode', None), googletx.get('value', None) - + return ( + googletx.get('source', None), + googletx.get('languageCode', None), + googletx.get('value', None), + ) -def new_transcript_revision_from_old(old_transcript_revision_dict: dict) -> dict | None: +def new_revision_from_old(old_transcript_revision_dict: dict) -> dict | None: # ignore bad data if ( 'languageCode' not in old_transcript_revision_dict @@ -133,25 +174,6 @@ def new_transcript_revision_from_old(old_transcript_revision_dict: dict) -> dict '_dateAccepted': None, } -def new_translation_revision_from_old(old_translation_revision_dict: dict, source_uuid, source_action) -> dict | None: - # ignore bad data - if ( - 'languageCode' not in old_translation_revision_dict - or 'value' not in old_translation_revision_dict - ): - return None - return { - '_dateCreated': old_translation_revision_dict.get('dateModified', None), - 'language': old_translation_revision_dict['languageCode'], - 'value': old_translation_revision_dict['value'], - '_uuid': generate_uuid_for_form(), - '_dateAccepted': None, - '_dependency': { - '_actionId': source_action, - '_uuid': source_uuid, - } - } - def separate_transcriptions( transcription_dict: dict, @@ -162,7 +184,7 @@ def separate_transcriptions( return [], [] automatic_transcriptions = [] manual_transcriptions = [] - latest_revision = new_transcript_revision_from_old(transcription_dict) + latest_revision = new_revision_from_old(transcription_dict) if latest_revision: if ( latest_revision['value'] == automatic_transcript_value @@ -175,7 +197,7 @@ def separate_transcriptions( manual_transcriptions.append(latest_revision) for revision in transcription_dict.get('revisions', []): - revision_formatted = new_transcript_revision_from_old(revision) + revision_formatted = new_revision_from_old(revision) if revision_formatted is None: continue if ( @@ -189,9 +211,16 @@ def separate_transcriptions( manual_transcriptions.append(revision_formatted) return manual_transcriptions, automatic_transcriptions -def separate_translations(language, translation_dict, + +def separate_translations( + language, + translation_dict, + automatic_translation_source_language: str = None, automatic_translation_language: str = None, - automatic_translation_value: str = None, source_uuid=None, source_action_id=None): + automatic_translation_value: str = None, + most_recent_transcript=None, + most_recent_transcript_by_language=None, +): """ {'es': {'dateCreated': '2025-10-22T14:30:38Z', 'dateModified': '2025-10-22T17:10:23Z', @@ -209,7 +238,7 @@ def separate_translations(language, translation_dict, """ automatic_translations = [] manual_translations = [] - latest_revision = new_translation_revision_from_old(translation_dict, source_uuid=source_uuid, source_action=source_action_id) + latest_revision = new_revision_from_old(translation_dict) if latest_revision: if ( latest_revision['value'] == automatic_translation_value @@ -217,12 +246,17 @@ def separate_translations(language, translation_dict, ): latest_revision['status'] = 'complete' latest_revision['_dateAccepted'] = timezone.now() + source = most_recent_transcript_by_language.get( + automatic_translation_source_language, most_recent_transcript + ) + latest_revision['source'] = source automatic_translations.append(latest_revision) else: + latest_revision['source'] = most_recent_transcript manual_translations.append(latest_revision) for revision in translation_dict.get('revisions', []): - revision_formatted = new_transcript_revision_from_old(revision) + revision_formatted = new_revision_from_old(revision) if revision_formatted is None: continue if ( @@ -231,9 +265,12 @@ def separate_translations(language, translation_dict, ): revision_formatted['status'] = 'complete' revision_formatted['_dateAccepted'] = timezone.now() + source = most_recent_transcript_by_language.get( + automatic_translation_source_language, most_recent_transcript + ) + revision_formatted['source'] = source automatic_translations.append(revision_formatted) else: + revision_formatted['source'] = most_recent_transcript manual_translations.append(revision_formatted) return manual_translations, automatic_translations - - From ef8655a38cc586ef1f11a08ca4837b3065389245 Mon Sep 17 00:00:00 2001 From: rgraber Date: Tue, 28 Oct 2025 10:22:53 -0400 Subject: [PATCH 07/18] fixup!: stuff --- .../subsequences/tests/test_versioning.py | 72 ++++++++++--------- kobo/apps/subsequences/utils/versioning.py | 15 ++-- 2 files changed, 49 insertions(+), 38 deletions(-) diff --git a/kobo/apps/subsequences/tests/test_versioning.py b/kobo/apps/subsequences/tests/test_versioning.py index 4253431d87..416d8b502a 100644 --- a/kobo/apps/subsequences/tests/test_versioning.py +++ b/kobo/apps/subsequences/tests/test_versioning.py @@ -173,9 +173,14 @@ def test_determine_source_transcripts(self): assert most_recent_by_language['fr']['_uuid'] == 'uuid-5-manual' assert most_recent_by_language['fr']['_actionId'] == 'manual_transcription' + def test_migrate_translations(self): + pass + @pytest.mark.skip() def test_migrate_submission_extra_to_supplemental(self): + now = timezone.now() + one_year_ago = now - timedelta(days=365) old_version = {'Audio_question': {'googlets': {'languageCode': 'en', 'regionCode': None, 'status': 'complete', @@ -186,11 +191,10 @@ def test_migrate_submission_extra_to_supplemental(self): 'status': 'complete', 'value': 'Este es un audio que estoy ' 'intentando transcribir.'}, - 'transcript': {'dateCreated': None, - 'dateModified': '2025-10-22 17:09:38', + 'transcript': {'dateCreated': one_year_ago, + 'dateModified': now, 'languageCode': 'en', - 'revisions': [{'dateModified': '2025-10-22 ' - '14:30:24', + 'revisions': [{'dateModified': one_year_ago, 'languageCode': 'en', 'value': 'This is audio that ' 'I am trying to ' @@ -198,10 +202,10 @@ def test_migrate_submission_extra_to_supplemental(self): {}], 'value': 'This is audio that I am trying to ' 'transcribe but i edited it.'}, - 'translation': {'es': {'dateCreated': '2025-10-22T14:30:38Z', - 'dateModified': '2025-10-22T17:10:23Z', + 'translation': {'es': {'dateCreated': one_year_ago, + 'dateModified': now, 'languageCode': 'es', - 'revisions': [{'dateModified': '2025-10-22T14:30:38Z', + 'revisions': [{'dateModified': one_year_ago, 'languageCode': 'es', 'value': 'Este es un ' 'audio que ' @@ -212,17 +216,21 @@ def test_migrate_submission_extra_to_supplemental(self): 'estoy intentando ' 'transcribir pero yo lo edité'}}}} + with patch('kobo.apps.subsequences.utils.versioning.generate_uuid_for_form', side_effect=['uuid1', 'uuid2', 'uuid3', 'uuid4']): + with freeze_time(now): + migrated = migrate_submission_supplementals(old_version) + new_version = { '_version': '20250820', 'Audio_question': { 'automatic_transcription': { - '_dateCreated': '', - '_dateModified': '', + '_dateCreated': one_year_ago, + '_dateModified': one_year_ago, '_versions': [ { - '_dateCreated': '', - '_dateAccepted': '', - '_uuid':'', + '_dateCreated': one_year_ago, + '_dateAccepted': now, + '_uuid':'uuid2', 'language': 'en', 'value': 'This is audio that I am trying to ' 'transcribe.', @@ -232,15 +240,15 @@ def test_migrate_submission_extra_to_supplemental(self): }, 'automatic_translation': { 'es': { - '_dateCreated': '', - '_dateModified': '', + '_dateCreated': one_year_ago, + '_dateModified': one_year_ago, '_versions': [ { - '_dateCreated': '', - '_dateAccepted': '', + '_dateCreated': one_year_ago, + '_dateAccepted': now, '_dependency': {'_actionId': 'manual_transcription', - '_uuid': 'a0030a86-d207-4249-8335-9a767fbd77eb'}, - '_uuid':'', + '_uuid': 'uuid1'}, + '_uuid':'uuid4', 'language': 'es', 'value': 'Esto es un audio que estoy intendando a transcribir', 'status': 'complete' @@ -249,13 +257,13 @@ def test_migrate_submission_extra_to_supplemental(self): } }, 'manual_transcription': { - '_dateCreated': '', - '_dateModified': '', + '_dateCreated': now, + '_dateModified': now, '_versions': [ { - '_dateCreated': '', - '_dateAccepted': '', - '_uuid':'', + '_dateCreated': now, + '_dateAccepted': None, + '_uuid':'uuid1', 'language': 'en', 'value': 'This is audio that I am trying to ' 'transcribe but i edited it.', @@ -264,15 +272,15 @@ def test_migrate_submission_extra_to_supplemental(self): }, 'manual_translation': { 'es': { - '_dateCreated': '', - '_dateModified': '', + '_dateCreated': now, + '_dateModified': now, '_versions': [ { - '_dateCreated': '', - '_dateAccepted': '', - '_dependency': {'_actionId': 'automatic_transcription', - '_uuid': 'a0030a86-d207-4249-8335-9a767fbd77eb'}, - '_uuid':'', + '_dateCreated': now, + '_dateAccepted': now, + '_dependency': {'_actionId': 'manual_transcription', + '_uuid': 'uuid1'}, + '_uuid':'uuid3', 'language': 'es', 'value': 'Esto es un audio que estoy intendando a transcribir pero yo lo edité', 'status': 'complete' @@ -282,7 +290,5 @@ def test_migrate_submission_extra_to_supplemental(self): }, } } - - - self.assertEqual(True, False) # add assertion here + assert migrated == new_version # add assertion here diff --git a/kobo/apps/subsequences/utils/versioning.py b/kobo/apps/subsequences/utils/versioning.py index a02f0a5e82..2f2f4bed65 100644 --- a/kobo/apps/subsequences/utils/versioning.py +++ b/kobo/apps/subsequences/utils/versioning.py @@ -81,7 +81,6 @@ def migrate_submission_supplementals(supplemental_data:dict) -> dict: '_dateModified': automatic_transcripts[0]['_dateCreated'], '_versions': automatic_transcripts, } - supplemental[question_xpath] = question_results_by_action # translation # determine what to use as the source transcript @@ -98,15 +97,21 @@ def migrate_submission_supplementals(supplemental_data:dict) -> dict: automatic_translations = {} manual_translations = {} for language_code, translations in translations_dict.items(): - automatic_translations_for_language = separate_translations( + automatic_translations_for_language, manual_translations_for_language = separate_translations( language_code, + translations, automatic_translation_source_language, automatic_translation_language, automatic_translation_value, most_recent_transcript, most_recent_transcript_by_language, ) - pass + automatic_translations[language_code] = automatic_translations_for_language + manual_translations[language_code] = manual_translations_for_language + question_results_by_action['automatic_translation'] = automatic_translations + question_results_by_action['manual_translation'] = manual_translations + supplemental[question_xpath] = question_results_by_action + return supplemental @@ -242,7 +247,7 @@ def separate_translations( if latest_revision: if ( latest_revision['value'] == automatic_translation_value - and latest_revision['language'] == automatic_translation_language + and language == automatic_translation_language ): latest_revision['status'] = 'complete' latest_revision['_dateAccepted'] = timezone.now() @@ -260,7 +265,7 @@ def separate_translations( if revision_formatted is None: continue if ( - revision_formatted['language'] == automatic_translation_language + language == automatic_translation_language and revision['value'] == automatic_translation_value ): revision_formatted['status'] = 'complete' From 3c35f145fa34152fe5f436f1ccbef57c64972ce7 Mon Sep 17 00:00:00 2001 From: rgraber Date: Tue, 28 Oct 2025 13:14:26 -0400 Subject: [PATCH 08/18] fixup!: messy but functional --- .../subsequences/tests/test_versioning.py | 40 +++++++++---------- kobo/apps/subsequences/utils/versioning.py | 39 ++++++++++++------ 2 files changed, 45 insertions(+), 34 deletions(-) diff --git a/kobo/apps/subsequences/tests/test_versioning.py b/kobo/apps/subsequences/tests/test_versioning.py index 416d8b502a..9cff737787 100644 --- a/kobo/apps/subsequences/tests/test_versioning.py +++ b/kobo/apps/subsequences/tests/test_versioning.py @@ -177,10 +177,9 @@ def test_migrate_translations(self): pass - @pytest.mark.skip() def test_migrate_submission_extra_to_supplemental(self): now = timezone.now() - one_year_ago = now - timedelta(days=365) + one_year_ago = (now - timedelta(days=365)).isoformat() old_version = {'Audio_question': {'googlets': {'languageCode': 'en', 'regionCode': None, 'status': 'complete', @@ -189,10 +188,10 @@ def test_migrate_submission_extra_to_supplemental(self): 'googletx': {'languageCode': 'es', 'source': 'en', 'status': 'complete', - 'value': 'Este es un audio que estoy ' - 'intentando transcribir.'}, + 'value': 'Esto es un audio que estoy ' + 'intentando a transcribir.'}, 'transcript': {'dateCreated': one_year_ago, - 'dateModified': now, + 'dateModified': now.isoformat(), 'languageCode': 'en', 'revisions': [{'dateModified': one_year_ago, 'languageCode': 'en', @@ -203,16 +202,16 @@ def test_migrate_submission_extra_to_supplemental(self): 'value': 'This is audio that I am trying to ' 'transcribe but i edited it.'}, 'translation': {'es': {'dateCreated': one_year_ago, - 'dateModified': now, + 'dateModified': now.isoformat(), 'languageCode': 'es', 'revisions': [{'dateModified': one_year_ago, 'languageCode': 'es', - 'value': 'Este es un ' + 'value': 'Esto es un ' 'audio que ' 'estoy ' - 'intentando ' + 'intentando a ' 'transcribir.'}], - 'value': 'Este es un audio que ' + 'value': 'Esto es un audio que ' 'estoy intentando ' 'transcribir pero yo lo edité'}}}} @@ -229,7 +228,7 @@ def test_migrate_submission_extra_to_supplemental(self): '_versions': [ { '_dateCreated': one_year_ago, - '_dateAccepted': now, + '_dateAccepted': now.isoformat(), '_uuid':'uuid2', 'language': 'en', 'value': 'This is audio that I am trying to ' @@ -245,23 +244,23 @@ def test_migrate_submission_extra_to_supplemental(self): '_versions': [ { '_dateCreated': one_year_ago, - '_dateAccepted': now, + '_dateAccepted': now.isoformat(), '_dependency': {'_actionId': 'manual_transcription', '_uuid': 'uuid1'}, '_uuid':'uuid4', 'language': 'es', - 'value': 'Esto es un audio que estoy intendando a transcribir', + 'value': 'Esto es un audio que estoy intentando a transcribir.', 'status': 'complete' } ] } }, 'manual_transcription': { - '_dateCreated': now, - '_dateModified': now, + '_dateCreated': now.isoformat(), + '_dateModified': now.isoformat(), '_versions': [ { - '_dateCreated': now, + '_dateCreated': now.isoformat(), '_dateAccepted': None, '_uuid':'uuid1', 'language': 'en', @@ -272,18 +271,17 @@ def test_migrate_submission_extra_to_supplemental(self): }, 'manual_translation': { 'es': { - '_dateCreated': now, - '_dateModified': now, + '_dateCreated': now.isoformat(), + '_dateModified': now.isoformat(), '_versions': [ { - '_dateCreated': now, - '_dateAccepted': now, + '_dateCreated': now.isoformat(), + '_dateAccepted': None, '_dependency': {'_actionId': 'manual_transcription', '_uuid': 'uuid1'}, '_uuid':'uuid3', 'language': 'es', - 'value': 'Esto es un audio que estoy intendando a transcribir pero yo lo edité', - 'status': 'complete' + 'value': 'Esto es un audio que estoy intentando transcribir pero yo lo edité', } ] } diff --git a/kobo/apps/subsequences/utils/versioning.py b/kobo/apps/subsequences/utils/versioning.py index 2f2f4bed65..15e82c3c26 100644 --- a/kobo/apps/subsequences/utils/versioning.py +++ b/kobo/apps/subsequences/utils/versioning.py @@ -97,7 +97,7 @@ def migrate_submission_supplementals(supplemental_data:dict) -> dict: automatic_translations = {} manual_translations = {} for language_code, translations in translations_dict.items(): - automatic_translations_for_language, manual_translations_for_language = separate_translations( + manual_translations_for_language, automatic_translations_for_language = separate_translations( language_code, translations, automatic_translation_source_language, @@ -106,9 +106,22 @@ def migrate_submission_supplementals(supplemental_data:dict) -> dict: most_recent_transcript, most_recent_transcript_by_language, ) - automatic_translations[language_code] = automatic_translations_for_language - manual_translations[language_code] = manual_translations_for_language - question_results_by_action['automatic_translation'] = automatic_translations + if len(automatic_translations_for_language) > 0: + automatic_translations_for_language.sort(reverse=True, key =lambda x: x['_dateCreated']) + automatic_translations[language_code] = { + '_dateCreated': automatic_translations_for_language[-1]['_dateCreated'], + '_dateModified': automatic_translations_for_language[0]['_dateCreated'] + } + automatic_translations[language_code]['_versions'] = automatic_translations_for_language + if len(manual_translations_for_language) > 0: + manual_translations_for_language.sort(reverse=True, key =lambda x: x['_dateCreated']) + manual_translations[language_code] = { + '_dateCreated': manual_translations_for_language[-1]['_dateCreated'], + '_dateModified': manual_translations_for_language[0]['_dateCreated'] + } + manual_translations[language_code]['_versions'] = manual_translations_for_language + if automatic_translations != {}: + question_results_by_action['automatic_translation'] = automatic_translations question_results_by_action['manual_translation'] = manual_translations supplemental[question_xpath] = question_results_by_action @@ -196,7 +209,7 @@ def separate_transcriptions( and latest_revision['language'] == automatic_transcript_language ): latest_revision['status'] = 'complete' - latest_revision['_dateAccepted'] = timezone.now() + latest_revision['_dateAccepted'] = timezone.now().isoformat() automatic_transcriptions.append(latest_revision) else: manual_transcriptions.append(latest_revision) @@ -210,7 +223,7 @@ def separate_transcriptions( and revision['value'] == automatic_transcript_value ): revision_formatted['status'] = 'complete' - revision_formatted['_dateAccepted'] = timezone.now() + revision_formatted['_dateAccepted'] = timezone.now().isoformat() automatic_transcriptions.append(revision_formatted) else: manual_transcriptions.append(revision_formatted) @@ -225,7 +238,7 @@ def separate_translations( automatic_translation_value: str = None, most_recent_transcript=None, most_recent_transcript_by_language=None, -): +) -> tuple[list, list]: """ {'es': {'dateCreated': '2025-10-22T14:30:38Z', 'dateModified': '2025-10-22T17:10:23Z', @@ -250,14 +263,14 @@ def separate_translations( and language == automatic_translation_language ): latest_revision['status'] = 'complete' - latest_revision['_dateAccepted'] = timezone.now() + latest_revision['_dateAccepted'] = timezone.now().isoformat() source = most_recent_transcript_by_language.get( automatic_translation_source_language, most_recent_transcript ) - latest_revision['source'] = source + latest_revision['_dependency'] = source automatic_translations.append(latest_revision) else: - latest_revision['source'] = most_recent_transcript + latest_revision['_dependency'] = most_recent_transcript manual_translations.append(latest_revision) for revision in translation_dict.get('revisions', []): @@ -269,13 +282,13 @@ def separate_translations( and revision['value'] == automatic_translation_value ): revision_formatted['status'] = 'complete' - revision_formatted['_dateAccepted'] = timezone.now() + revision_formatted['_dateAccepted'] = timezone.now().isoformat() source = most_recent_transcript_by_language.get( automatic_translation_source_language, most_recent_transcript ) - revision_formatted['source'] = source + revision_formatted['_dependency'] = source automatic_translations.append(revision_formatted) else: - revision_formatted['source'] = most_recent_transcript + revision_formatted['_dependency'] = most_recent_transcript manual_translations.append(revision_formatted) return manual_translations, automatic_translations From 335ff049686a4e1b034245fd711deb7849873e66 Mon Sep 17 00:00:00 2001 From: rgraber Date: Tue, 28 Oct 2025 15:27:37 -0400 Subject: [PATCH 09/18] fixup!: cleaning --- .../subsequences/tests/test_versioning.py | 351 ++++++++------- kobo/apps/subsequences/utils/versioning.py | 409 ++++++++---------- 2 files changed, 360 insertions(+), 400 deletions(-) diff --git a/kobo/apps/subsequences/tests/test_versioning.py b/kobo/apps/subsequences/tests/test_versioning.py index 9cff737787..1ebce79aed 100644 --- a/kobo/apps/subsequences/tests/test_versioning.py +++ b/kobo/apps/subsequences/tests/test_versioning.py @@ -1,23 +1,41 @@ -from datetime import timedelta +from datetime import datetime, timedelta from unittest.mock import patch -import pytest -from ddt import data, ddt +from ddt import data, ddt, unpack from django.test import TestCase from django.utils import timezone from freezegun import freeze_time from kobo.apps.subsequences.utils.versioning import ( - determine_source_transcripts, + _determine_source_transcript, + _new_revision_from_old, + _separate_manual_and_automatic_versions, migrate_submission_supplementals, - new_revision_from_old, - separate_transcriptions, ) @ddt class TestVersioning(TestCase): - def test_new_transcript_revision_from_old(self): + def setUp(self): + super().setUp() + # works for translations or transcriptions + self.now = timezone.now().isoformat() + self.yesterday = (timezone.now() - timedelta(days=1)).isoformat() + self.action_dict = { + 'dateCreated': None, + 'dateModified': self.now, + 'languageCode': 'en', + 'revisions': [ + { + 'dateModified': self.yesterday, + 'languageCode': 'en', + 'value': 'Old value', + } + ], + 'value': 'Latest value', + } + + def test_new_revision_from_old(self): now = timezone.now() old = { 'dateCreated': None, @@ -26,7 +44,7 @@ def test_new_transcript_revision_from_old(self): 'value': 'Transcribed new', } with freeze_time(now): - result = new_revision_from_old(old) + result = _new_revision_from_old(old) assert result['value'] == old['value'] assert result['language'] == old['languageCode'] assert result['_dateCreated'] == old['dateModified'] @@ -35,30 +53,15 @@ def test_new_transcript_revision_from_old(self): def test_new_transcript_revision_from_old_returns_none_for_bad_data(self): old = {'badly': 'formatted'} - assert new_revision_from_old(old) is None + assert _new_revision_from_old(old) is None @data(True, False) - def test_separate_automated_and_manual_transcriptions(self, latest_is_automated): - now = timezone.now() - yesterday = timezone.now() - timedelta(days=1) - transcript_dict = { - 'dateCreated': None, - 'dateModified': now, - 'languageCode': 'en', - 'revisions': [ - { - 'dateModified': yesterday, - 'languageCode': 'en', - 'value': 'Old transcript', - } - ], - 'value': 'Latest transcript', - } + def test_separate_automatic_and_manual(self, latest_is_automated): automated_transcription_value = ( - 'Latest transcript' if latest_is_automated else 'Old transcript' + 'Latest value' if latest_is_automated else 'Old value' ) - manual, automated = separate_transcriptions( - transcript_dict, 'en', automated_transcription_value + manual, automated = _separate_manual_and_automatic_versions( + self.action_dict, 'en', automated_transcription_value ) new_automated_transcript = automated[0] new_manual_transcript = manual[0] @@ -69,153 +72,143 @@ def test_separate_automated_and_manual_transcriptions(self, latest_is_automated) new_manual_transcript if latest_is_automated else new_automated_transcript ) - assert expected_most_recent_transcript['_dateCreated'] == now - assert expected_most_recent_transcript['value'] == 'Latest transcript' - assert expected_old_transcript['_dateCreated'] == yesterday - assert expected_old_transcript['value'] == 'Old transcript' + assert expected_most_recent_transcript['_dateCreated'] == self.now + assert expected_most_recent_transcript['value'] == 'Latest value' + assert expected_old_transcript['_dateCreated'] == self.yesterday + assert expected_old_transcript['value'] == 'Old value' - def test_migrate_transcriptions(self): - now = timezone.now() - one_year_ago = now - timedelta(days=365) - old_version = {'Audio_question': {'googlets': {'languageCode': 'en', - 'regionCode': None, - 'status': 'complete', - 'value': 'This is audio that I am trying to ' - 'transcribe.'}, - 'transcript': {'dateCreated': one_year_ago, - 'dateModified': now, - 'languageCode': 'en', - 'revisions': [{'dateModified': one_year_ago, - 'languageCode': 'en', - 'value': 'This is audio that ' - 'I am trying to ' - 'transcribe.'}, - {}], - 'value': 'This is audio that I am trying to ' - 'transcribe but i edited it.'}, - } - } - with patch('kobo.apps.subsequences.utils.versioning.generate_uuid_for_form', side_effect=['uuid1', 'uuid2']): - with freeze_time(now): - migrated = migrate_submission_supplementals(old_version) - expected = { - '_version': '20250820', - 'Audio_question': { - 'automatic_transcription': { - '_dateCreated': one_year_ago, - '_dateModified': one_year_ago, - '_versions': [ - { - '_dateCreated': one_year_ago, - '_dateAccepted': now, - '_uuid':'uuid2', - 'language': 'en', - 'value': 'This is audio that I am trying to ' - 'transcribe.', - 'status': 'complete', - } - ] - }, - 'manual_transcription': { - '_dateCreated': now, - '_dateModified': now, - '_versions': [ - { - '_dateCreated': now, - '_dateAccepted': None, - '_uuid':'uuid1', - 'language': 'en', - 'value': 'This is audio that I am trying to ' - 'transcribe but i edited it.', - } - ] - }, - } - } - assert migrated == expected + def test_separate_automatic_and_manual_forces_language_if_given(self): + manual, automated = _separate_manual_and_automatic_versions( + self.action_dict, None, None, language='en' + ) + for formatted_item in manual: + assert formatted_item['language'] == 'en' + + def test_separate_automatic_and_manual_without_automatic_value(self): + manual, automatic = _separate_manual_and_automatic_versions( + self.action_dict, None, None + ) + assert len(manual) == 2 + assert len(automatic) == 0 - def test_determine_source_transcripts(self): - manual_transcripts = [] - automatic_transcripts = [] + @data( + # known language, date created, expected result uuid + # there is a transcript of the same language with an older date + ('de', '2024-12-31', 'uuid4'), + # there are transcripts of the same language but none older than the translation + ('de', '2023-01-01', 'uuid3'), + # there are no transcripts of the same language + ('fr', '2024-12-31', 'uuid1'), + # we don't know the source language but there are older transcripts + (None, '2024-12-31', 'uuid2'), + # we don't know the source language and there are no older transcripts + (None, '2023-01-01', 'uuid1'), + ) + @unpack + def test_determine_source_transcription( + self, source_language, date_created, expected_source_uuid + ): now = timezone.now() - for i in range(5): - manual = { - '_dateCreated': now - timedelta(days=i), + one_day_ago = now - timedelta(days=1) + jan_1_2024 = datetime(2024, 1, 1, tzinfo=timezone.utc) + jan_2_2024 = datetime(2024, 1, 2, tzinfo=timezone.utc) + transcripts = [ + { + '_uuid': 'uuid1', + '_dateCreated': now.isoformat(), 'language': 'en', - 'value': 'Value', - '_uuid': f'uuid-{i}-manual', - '_dateAccepted': None, - } - automatic = { - **manual, - '_uuid': f'uuid-{i}-automatic', - '_dateCreated': now - timedelta(days=i + 1), - } - manual_transcripts.append(manual) - automatic_transcripts.append(automatic) - # add an old transcript in a different language - manual_transcripts.append( + '_actionId': 'manual_transcription', + }, { - '_dateCreated': now - timedelta(days=5), - 'language': 'fr', - 'value': 'Value', - '_uuid': f'uuid-5-manual', - '_dateAccepted': None, - } - ) - most_recent_overall, most_recent_by_language = determine_source_transcripts( - manual_transcripts, automatic_transcripts + '_uuid': 'uuid2', + '_dateCreated': jan_1_2024.isoformat(), + 'language': 'en', + '_actionId': 'automatic_transcription', + }, + { + '_uuid': 'uuid3', + '_dateCreated': one_day_ago.isoformat(), + 'language': 'de', + '_actionId': 'manual_transcription', + }, + { + '_uuid': 'uuid4', + '_dateCreated': jan_2_2024.isoformat(), + 'language': 'de', + '_actionId': 'automatic_transcription', + }, + ] + translation_revision = {'_dateCreated': date_created} + source_transcript = _determine_source_transcript( + translation_revision, transcripts, automatic_source_language=source_language ) - assert most_recent_overall['_uuid'] == 'uuid-0-manual' - assert most_recent_overall['_actionId'] == 'manual_transcription' - assert most_recent_by_language['en']['_uuid'] == 'uuid-0-manual' - assert most_recent_by_language['en']['_actionId'] == 'manual_transcription' - assert most_recent_by_language['fr']['_uuid'] == 'uuid-5-manual' - assert most_recent_by_language['fr']['_actionId'] == 'manual_transcription' - - def test_migrate_translations(self): - pass - + assert source_transcript['_uuid'] == expected_source_uuid + # test the whole transformation process def test_migrate_submission_extra_to_supplemental(self): now = timezone.now() + one_day_ago = (now - timedelta(days=1)).isoformat() one_year_ago = (now - timedelta(days=365)).isoformat() - old_version = {'Audio_question': {'googlets': {'languageCode': 'en', - 'regionCode': None, - 'status': 'complete', - 'value': 'This is audio that I am trying to ' - 'transcribe.'}, - 'googletx': {'languageCode': 'es', - 'source': 'en', - 'status': 'complete', - 'value': 'Esto es un audio que estoy ' - 'intentando a transcribir.'}, - 'transcript': {'dateCreated': one_year_ago, - 'dateModified': now.isoformat(), - 'languageCode': 'en', - 'revisions': [{'dateModified': one_year_ago, - 'languageCode': 'en', - 'value': 'This is audio that ' - 'I am trying to ' - 'transcribe.'}, - {}], - 'value': 'This is audio that I am trying to ' - 'transcribe but i edited it.'}, - 'translation': {'es': {'dateCreated': one_year_ago, - 'dateModified': now.isoformat(), - 'languageCode': 'es', - 'revisions': [{'dateModified': one_year_ago, - 'languageCode': 'es', - 'value': 'Esto es un ' - 'audio que ' - 'estoy ' - 'intentando a ' - 'transcribir.'}], - 'value': 'Esto es un audio que ' - 'estoy intentando ' - 'transcribir pero yo lo edité'}}}} + a_year_and_a_day_ago = (now - timedelta(days=366)).isoformat() + old_version = { + 'Audio_question': { + 'googlets': { + 'languageCode': 'en', + 'regionCode': None, + 'status': 'complete', + 'value': 'This is audio that I am trying to ' 'transcribe.', + }, + 'googletx': { + 'languageCode': 'es', + 'source': 'en', + 'status': 'complete', + 'value': 'Esto es un audio que estoy ' 'intentando a transcribir.', + }, + 'transcript': { + 'dateCreated': one_day_ago, + 'dateModified': one_day_ago, + 'languageCode': 'en', + 'revisions': [ + { + 'dateModified': a_year_and_a_day_ago, + 'languageCode': 'en', + 'value': 'This is audio that ' + 'I am trying to ' + 'transcribe.', + }, + {}, + ], + 'value': 'This is audio that I am trying to ' + 'transcribe but i edited it.', + }, + 'translation': { + 'es': { + 'dateCreated': one_year_ago, + 'dateModified': now.isoformat(), + 'languageCode': 'es', + 'revisions': [ + { + 'dateModified': one_year_ago, + 'languageCode': 'es', + 'value': 'Esto es un ' + 'audio que ' + 'estoy ' + 'intentando a ' + 'transcribir.', + } + ], + 'value': 'Esto es un audio que ' + 'estoy intentando ' + 'transcribir pero yo lo edité', + } + }, + } + } - with patch('kobo.apps.subsequences.utils.versioning.generate_uuid_for_form', side_effect=['uuid1', 'uuid2', 'uuid3', 'uuid4']): + with patch( + 'kobo.apps.subsequences.utils.versioning.generate_uuid_for_form', + side_effect=['uuid1', 'uuid2', 'uuid3', 'uuid4'], + ): with freeze_time(now): migrated = migrate_submission_supplementals(old_version) @@ -223,11 +216,11 @@ def test_migrate_submission_extra_to_supplemental(self): '_version': '20250820', 'Audio_question': { 'automatic_transcription': { - '_dateCreated': one_year_ago, - '_dateModified': one_year_ago, + '_dateCreated': a_year_and_a_day_ago, + '_dateModified': a_year_and_a_day_ago, '_versions': [ { - '_dateCreated': one_year_ago, + '_dateCreated': a_year_and_a_day_ago, '_dateAccepted': now.isoformat(), '_uuid':'uuid2', 'language': 'en', @@ -245,22 +238,25 @@ def test_migrate_submission_extra_to_supplemental(self): { '_dateCreated': one_year_ago, '_dateAccepted': now.isoformat(), - '_dependency': {'_actionId': 'manual_transcription', - '_uuid': 'uuid1'}, - '_uuid':'uuid4', + '_dependency': { + '_actionId': 'automatic_transcription', + '_uuid': 'uuid2', + }, + '_uuid': 'uuid4', 'language': 'es', - 'value': 'Esto es un audio que estoy intentando a transcribir.', - 'status': 'complete' + 'value': 'Esto es un audio que estoy intentando a' + ' transcribir.', + 'status': 'complete', } ] } }, 'manual_transcription': { - '_dateCreated': now.isoformat(), - '_dateModified': now.isoformat(), + '_dateCreated': one_day_ago, + '_dateModified': one_day_ago, '_versions': [ { - '_dateCreated': now.isoformat(), + '_dateCreated': one_day_ago, '_dateAccepted': None, '_uuid':'uuid1', 'language': 'en', @@ -281,12 +277,13 @@ def test_migrate_submission_extra_to_supplemental(self): '_uuid': 'uuid1'}, '_uuid':'uuid3', 'language': 'es', - 'value': 'Esto es un audio que estoy intentando transcribir pero yo lo edité', + 'value': 'Esto es un audio que estoy intentando' + ' transcribir pero yo lo edité', } ] } }, } } - assert migrated == new_version # add assertion here + assert migrated == new_version diff --git a/kobo/apps/subsequences/utils/versioning.py b/kobo/apps/subsequences/utils/versioning.py index 15e82c3c26..fc45e5859d 100644 --- a/kobo/apps/subsequences/utils/versioning.py +++ b/kobo/apps/subsequences/utils/versioning.py @@ -4,8 +4,135 @@ from ..constants import SCHEMA_VERSIONS -class InvalidSupplementalFormat(Exception): - pass +def _add_translation_sources( + version_list, all_tagged_transcripts, automatic_translation_source_language=None +): + for translation in version_list: + # determine and record the most likely source transcript + source = _determine_source_transcript( + translation, + all_tagged_transcripts, + automatic_source_language=automatic_translation_source_language, + ) + translation['_dependency'] = { + '_uuid': source['_uuid'], + '_actionId': source['_actionId'], + } + + +def _combine_source_transcripts(manual_transcripts, automatic_transcripts): + # Combine manual and automatic transcripts and sort by dateCreated descending + # tag them with the action so we don't lose track + tagged_manual_transcripts = [ + {**transcript, '_actionId': 'manual_transcription'} + for transcript in manual_transcripts + ] + tagged_automatic_transcripts = [ + {**transcript, '_actionId': 'automatic_transcription'} + for transcript in automatic_transcripts + ] + + all_tagged_transcripts = [*tagged_manual_transcripts, *tagged_automatic_transcripts] + all_tagged_transcripts.sort(reverse=True, key=lambda d: d['_dateCreated']) + return all_tagged_transcripts + + +def _determine_source_transcript( + translation_revision, all_transcripts, automatic_source_language=None +): + if automatic_source_language: # we know the source language + transcripts_matching_language = [ + transcript + for transcript in all_transcripts + if transcript['language'] == automatic_source_language + ] + for transcript in transcripts_matching_language: + # is there a transcript in the source language created earlier than the + # translation? + if transcript['_dateCreated'] < translation_revision['_dateCreated']: + return transcript + # if not, is there *any* transcript in the source language? take the most + # recent one + if len(transcripts_matching_language) > 0: + return transcripts_matching_language[0] + else: + # is there a transcript older than the translation? + for transcript in all_transcripts: + if transcript['_dateCreated'] < translation_revision['_dateCreated']: + return transcript + # default to the most recent transcript + return all_transcripts[0] + + +def _new_revision_from_old(old_transcript_revision_dict: dict) -> dict | None: + # ignore bad data + if ( + 'languageCode' not in old_transcript_revision_dict + or 'value' not in old_transcript_revision_dict + ): + return None + return { + '_dateCreated': old_transcript_revision_dict.get('dateModified', None), + 'language': old_transcript_revision_dict['languageCode'], + 'value': old_transcript_revision_dict['value'], + '_uuid': generate_uuid_for_form(), + '_dateAccepted': None, + } + + +def _separate_manual_and_automatic_versions( + old_action_dictionary, + automatic_result_language, + automatic_result_value, + # translations have an expected language + language=None, +): + automatic_versions = [] + manual_versions = [] + latest_revision = { + key: val + for key, val in old_action_dictionary.items() + if key in ['value', 'languageCode', 'dateModified'] + } + # add the latest revision to the list of all revisions for easier processing + all_revisions = [latest_revision, *old_action_dictionary.get('revisions', [])] + for revision in all_revisions: + if language: + # force the expected language if given + revision['languageCode'] = language + revision_formatted = _new_revision_from_old(revision) + if revision_formatted is None: + continue + # if the language and value match that of the automatic result, + # assume this one was generated automatically + matches_automatic_result = ( + revision_formatted['language'] == automatic_result_language + and revision_formatted['value'] == automatic_result_value + ) + correct_version_list_to_append = ( + automatic_versions if matches_automatic_result else manual_versions + ) + if matches_automatic_result: + # automatic versions also need a status and a date accepted + revision_formatted['status'] = 'complete' + revision_formatted['_dateAccepted'] = timezone.now().isoformat() + correct_version_list_to_append.append(revision_formatted) + + # they should be sorted anyway, but just make sure in case the input values + # weren't sorted correctly + manual_versions.sort(reverse=True, key=lambda d: d['_dateCreated']) + automatic_versions.sort(reverse=True, key=lambda d: d['_dateCreated']) + + return manual_versions, automatic_versions + + +def _version_list_to_summary_dict(list_of_versions: list[dict]) -> dict: + return { + '_dateCreated': list_of_versions[-1]['_dateCreated'], + '_dateModified': list_of_versions[0]['_dateCreated'], + '_versions': list_of_versions, + } + def migrate_advanced_features(advanced_features: dict) -> dict | None: @@ -45,11 +172,7 @@ def migrate_advanced_features(advanced_features: dict) -> dict | None: return migrated_advanced_features -def set_version(schema: dict) -> dict: - schema['_version'] = SCHEMA_VERSIONS[0] - return schema - -def migrate_submission_supplementals(supplemental_data:dict) -> dict: +def migrate_submission_supplementals(supplemental_data: dict) -> dict | None: if supplemental_data.get('_version', None) == SCHEMA_VERSIONS[0]: return supplemental = { @@ -57,238 +180,78 @@ def migrate_submission_supplementals(supplemental_data:dict) -> dict: } for question_xpath, action_results in supplemental_data.items(): question_results_by_action = {} - automatic_transcript_language, automatic_transcript_result = ( - get_automatic_transcription(action_results) - ) - manual_transcripts, automatic_transcripts = separate_transcriptions( - action_results.get('transcript', None), - automatic_transcript_language, - automatic_transcript_result, + + # get all the automatic result data + automatic_transcript = action_results.get('googlets', {}) + automatic_transcript_language = automatic_transcript.get('languageCode', None) + automatic_transcript_value = automatic_transcript.get('value', None) + automatic_translation = action_results.get('googletx', {}) + automatic_translation_language = automatic_translation.get('languageCode') + automatic_translation_value = automatic_translation.get('value') + automatic_translation_source_language = automatic_translation.get('source') + + # divide transcripts into manual and automatic + manual_transcripts, automatic_transcripts = ( + _separate_manual_and_automatic_versions( + action_results.get('transcript', None), + automatic_transcript_language, + automatic_transcript_value, + ) ) - # should already be sorted by date created descending, but just in case - manual_transcripts.sort(reverse=True, key=lambda d: d['_dateCreated']) - automatic_transcripts.sort(reverse=True, key=lambda d: d['_dateCreated']) if len(manual_transcripts) > 0: - question_results_by_action['manual_transcription'] = { - '_dateCreated': manual_transcripts[-1]['_dateCreated'], - '_dateModified': manual_transcripts[0]['_dateCreated'], - '_versions': manual_transcripts, - } + question_results_by_action['manual_transcription'] = ( + _version_list_to_summary_dict(manual_transcripts) + ) if len(automatic_transcripts) > 0: - question_results_by_action['automatic_transcription'] = { - '_dateCreated': automatic_transcripts[-1]['_dateCreated'], - '_dateModified': automatic_transcripts[0]['_dateCreated'], - '_versions': automatic_transcripts, - } - - # translation - # determine what to use as the source transcript - most_recent_transcript, most_recent_transcript_by_language = ( - determine_source_transcripts(manual_transcripts, automatic_transcripts) - ) - ( - automatic_translation_source_language, - automatic_translation_language, - automatic_translation_value, - ) = get_automatic_translation(action_results) + question_results_by_action['automatic_transcription'] = ( + _version_list_to_summary_dict(automatic_transcripts) + ) + # process translations translations_dict = action_results.get('translation', {}) automatic_translations = {} manual_translations = {} + + # divide translations into manual and automatic by language for language_code, translations in translations_dict.items(): - manual_translations_for_language, automatic_translations_for_language = separate_translations( - language_code, - translations, - automatic_translation_source_language, - automatic_translation_language, - automatic_translation_value, - most_recent_transcript, - most_recent_transcript_by_language, + manual_translations_for_language, automatic_translations_for_language = ( + _separate_manual_and_automatic_versions( + translations, + automatic_translation_language, + automatic_translation_value, + language_code, + ) + ) + + all_tagged_transcripts = _combine_source_transcripts( + manual_transcripts, automatic_transcripts ) if len(automatic_translations_for_language) > 0: - automatic_translations_for_language.sort(reverse=True, key =lambda x: x['_dateCreated']) - automatic_translations[language_code] = { - '_dateCreated': automatic_translations_for_language[-1]['_dateCreated'], - '_dateModified': automatic_translations_for_language[0]['_dateCreated'] - } - automatic_translations[language_code]['_versions'] = automatic_translations_for_language + _add_translation_sources( + automatic_translations_for_language, + all_tagged_transcripts, + automatic_translation_source_language, + ) + automatic_translations[language_code] = _version_list_to_summary_dict( + automatic_translations_for_language + ) if len(manual_translations_for_language) > 0: - manual_translations_for_language.sort(reverse=True, key =lambda x: x['_dateCreated']) - manual_translations[language_code] = { - '_dateCreated': manual_translations_for_language[-1]['_dateCreated'], - '_dateModified': manual_translations_for_language[0]['_dateCreated'] - } - manual_translations[language_code]['_versions'] = manual_translations_for_language + _add_translation_sources( + manual_translations_for_language, all_tagged_transcripts + ) + manual_translations[language_code] = _version_list_to_summary_dict( + manual_translations_for_language + ) if automatic_translations != {}: question_results_by_action['automatic_translation'] = automatic_translations - question_results_by_action['manual_translation'] = manual_translations + if manual_translations != {}: + question_results_by_action['manual_translation'] = manual_translations supplemental[question_xpath] = question_results_by_action - return supplemental -def determine_source_transcripts(manual_transcripts, automatic_transcripts): - # First combine manual and automatic transcripts and sort by dateCreated descending - # tag them with the action so we don't lose track - tagged_manual_transcripts = [ - {**transcript, '_actionId': 'manual_transcription'} - for transcript in manual_transcripts - ] - tagged_automatic_transcripts = [ - {**transcript, '_actionId': 'automatic_translation'} - for transcript in automatic_transcripts - ] - - all_tagged_transcripts = [*tagged_manual_transcripts, *tagged_automatic_transcripts] - all_tagged_transcripts.sort(reverse=True, key=lambda d: d['_dateCreated']) - - # take the most recent transcript, manual or automatic, by language - most_recent_transcript_uuids_by_language = {} - for transcript in all_tagged_transcripts: - if most_recent_transcript_uuids_by_language.get(transcript['language']) is None: - most_recent_transcript_uuids_by_language[transcript['language']] = { - '_uuid': transcript['_uuid'], - '_actionId': transcript['_actionId'], - } - - # we don't always know the source language of a translation, so also get the most recent transcript overall - most_recent_transcript_overall = all_tagged_transcripts[0] - most_recent_transcript_overall = { - '_uuid': most_recent_transcript_overall['_uuid'], - '_actionId': most_recent_transcript_overall['_actionId'], - } - return most_recent_transcript_overall, most_recent_transcript_uuids_by_language - - -def get_automatic_transcription( - action_results: dict, -) -> tuple[str | None, str | None] | None: - googlets = action_results.get('googlets', {}) - return googlets.get('languageCode', None), googlets.get('value', None) - -def get_automatic_translation(action_results:dict): - googletx = action_results.get('googletx', {}) - return ( - googletx.get('source', None), - googletx.get('languageCode', None), - googletx.get('value', None), - ) - - -def new_revision_from_old(old_transcript_revision_dict: dict) -> dict | None: - # ignore bad data - if ( - 'languageCode' not in old_transcript_revision_dict - or 'value' not in old_transcript_revision_dict - ): - return None - return { - '_dateCreated': old_transcript_revision_dict.get('dateModified', None), - 'language': old_transcript_revision_dict['languageCode'], - 'value': old_transcript_revision_dict['value'], - '_uuid': generate_uuid_for_form(), - '_dateAccepted': None, - } - - -def separate_transcriptions( - transcription_dict: dict, - automatic_transcript_language: str = None, - automatic_transcript_value: str = None, -) -> tuple[list, list]: - if not transcription_dict: - return [], [] - automatic_transcriptions = [] - manual_transcriptions = [] - latest_revision = new_revision_from_old(transcription_dict) - if latest_revision: - if ( - latest_revision['value'] == automatic_transcript_value - and latest_revision['language'] == automatic_transcript_language - ): - latest_revision['status'] = 'complete' - latest_revision['_dateAccepted'] = timezone.now().isoformat() - automatic_transcriptions.append(latest_revision) - else: - manual_transcriptions.append(latest_revision) - - for revision in transcription_dict.get('revisions', []): - revision_formatted = new_revision_from_old(revision) - if revision_formatted is None: - continue - if ( - revision_formatted['language'] == automatic_transcript_language - and revision['value'] == automatic_transcript_value - ): - revision_formatted['status'] = 'complete' - revision_formatted['_dateAccepted'] = timezone.now().isoformat() - automatic_transcriptions.append(revision_formatted) - else: - manual_transcriptions.append(revision_formatted) - return manual_transcriptions, automatic_transcriptions - - -def separate_translations( - language, - translation_dict, - automatic_translation_source_language: str = None, - automatic_translation_language: str = None, - automatic_translation_value: str = None, - most_recent_transcript=None, - most_recent_transcript_by_language=None, -) -> tuple[list, list]: - """ - {'es': {'dateCreated': '2025-10-22T14:30:38Z', - 'dateModified': '2025-10-22T17:10:23Z', - 'languageCode': 'es', - 'revisions': [{'dateModified': '2025-10-22T14:30:38Z', - 'languageCode': 'es', - 'value': 'Este es un ' - 'audio que ' - 'estoy ' - 'intentando ' - 'transcribir.'}], - 'value': 'Este es un audio que ' - 'estoy intentando ' - 'transcribir pero yo lo edité'}} - """ - automatic_translations = [] - manual_translations = [] - latest_revision = new_revision_from_old(translation_dict) - if latest_revision: - if ( - latest_revision['value'] == automatic_translation_value - and language == automatic_translation_language - ): - latest_revision['status'] = 'complete' - latest_revision['_dateAccepted'] = timezone.now().isoformat() - source = most_recent_transcript_by_language.get( - automatic_translation_source_language, most_recent_transcript - ) - latest_revision['_dependency'] = source - automatic_translations.append(latest_revision) - else: - latest_revision['_dependency'] = most_recent_transcript - manual_translations.append(latest_revision) - - for revision in translation_dict.get('revisions', []): - revision_formatted = new_revision_from_old(revision) - if revision_formatted is None: - continue - if ( - language == automatic_translation_language - and revision['value'] == automatic_translation_value - ): - revision_formatted['status'] = 'complete' - revision_formatted['_dateAccepted'] = timezone.now().isoformat() - source = most_recent_transcript_by_language.get( - automatic_translation_source_language, most_recent_transcript - ) - revision_formatted['_dependency'] = source - automatic_translations.append(revision_formatted) - else: - revision_formatted['_dependency'] = most_recent_transcript - manual_translations.append(revision_formatted) - return manual_translations, automatic_translations +def set_version(schema: dict) -> dict: + schema['_version'] = SCHEMA_VERSIONS[0] + return schema From c638c6210f78bbee555f5ff1c7cf8fc16c40e78a Mon Sep 17 00:00:00 2001 From: rgraber Date: Tue, 28 Oct 2025 15:29:12 -0400 Subject: [PATCH 10/18] fixup!: accidental change --- kobo/apps/subsequences/tests/test_models.py | 1 - 1 file changed, 1 deletion(-) diff --git a/kobo/apps/subsequences/tests/test_models.py b/kobo/apps/subsequences/tests/test_models.py index 81bf26f267..6e49e1c530 100644 --- a/kobo/apps/subsequences/tests/test_models.py +++ b/kobo/apps/subsequences/tests/test_models.py @@ -337,4 +337,3 @@ def test_revise_data_raise_error_wrong_question_name(self): }, }, ) - From 73da17c94ac80e337351e67eb4058bdd5c5e26b8 Mon Sep 17 00:00:00 2001 From: rgraber Date: Tue, 28 Oct 2025 15:30:28 -0400 Subject: [PATCH 11/18] fixup!: accidental change --- kobo/settings/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kobo/settings/base.py b/kobo/settings/base.py index e64acfefe5..03c4d13887 100644 --- a/kobo/settings/base.py +++ b/kobo/settings/base.py @@ -1587,7 +1587,7 @@ def dj_stripe_request_callback_method(): # Google Cloud Storage # Not fully supported as a generic storage backend -GS_BUCKET_NAME = 'kobo-transcription-test' #env.str('GS_BUCKET_NAME', None) +GS_BUCKET_NAME = env.str('GS_BUCKET_NAME', None) """ Django error logging configuration """ From 397d8839f1de553d339fb8fa469bb5ea767f6b9f Mon Sep 17 00:00:00 2001 From: rgraber Date: Wed, 29 Oct 2025 08:31:55 -0400 Subject: [PATCH 12/18] fixup!: new uuid --- kobo/apps/subsequences/tests/test_versioning.py | 2 +- kobo/apps/subsequences/utils/versioning.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/kobo/apps/subsequences/tests/test_versioning.py b/kobo/apps/subsequences/tests/test_versioning.py index 1ebce79aed..e45b0e00a9 100644 --- a/kobo/apps/subsequences/tests/test_versioning.py +++ b/kobo/apps/subsequences/tests/test_versioning.py @@ -206,7 +206,7 @@ def test_migrate_submission_extra_to_supplemental(self): } with patch( - 'kobo.apps.subsequences.utils.versioning.generate_uuid_for_form', + 'kobo.apps.subsequences.utils.versioning.uuid.uuid4', side_effect=['uuid1', 'uuid2', 'uuid3', 'uuid4'], ): with freeze_time(now): diff --git a/kobo/apps/subsequences/utils/versioning.py b/kobo/apps/subsequences/utils/versioning.py index fc45e5859d..95436a561b 100644 --- a/kobo/apps/subsequences/utils/versioning.py +++ b/kobo/apps/subsequences/utils/versioning.py @@ -1,6 +1,7 @@ +import uuid + from django.utils import timezone -from ...openrosa.libs.utils.model_tools import generate_uuid_for_form from ..constants import SCHEMA_VERSIONS @@ -75,7 +76,7 @@ def _new_revision_from_old(old_transcript_revision_dict: dict) -> dict | None: '_dateCreated': old_transcript_revision_dict.get('dateModified', None), 'language': old_transcript_revision_dict['languageCode'], 'value': old_transcript_revision_dict['value'], - '_uuid': generate_uuid_for_form(), + '_uuid': uuid.uuid4(), '_dateAccepted': None, } From 3fd8f593963657a5421aae05ffe95816c3d66db4 Mon Sep 17 00:00:00 2001 From: rgraber Date: Wed, 29 Oct 2025 08:44:17 -0400 Subject: [PATCH 13/18] fixup!: format --- kobo/apps/subsequences/tests/test_versioning.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/kobo/apps/subsequences/tests/test_versioning.py b/kobo/apps/subsequences/tests/test_versioning.py index e45b0e00a9..52961f17b8 100644 --- a/kobo/apps/subsequences/tests/test_versioning.py +++ b/kobo/apps/subsequences/tests/test_versioning.py @@ -222,10 +222,9 @@ def test_migrate_submission_extra_to_supplemental(self): { '_dateCreated': a_year_and_a_day_ago, '_dateAccepted': now.isoformat(), - '_uuid':'uuid2', + '_uuid': 'uuid2', 'language': 'en', - 'value': 'This is audio that I am trying to ' - 'transcribe.', + 'value': 'This is audio that I am trying to transcribe.', 'status': 'complete', } ] @@ -258,7 +257,7 @@ def test_migrate_submission_extra_to_supplemental(self): { '_dateCreated': one_day_ago, '_dateAccepted': None, - '_uuid':'uuid1', + '_uuid': 'uuid1', 'language': 'en', 'value': 'This is audio that I am trying to ' 'transcribe but i edited it.', @@ -275,7 +274,7 @@ def test_migrate_submission_extra_to_supplemental(self): '_dateAccepted': None, '_dependency': {'_actionId': 'manual_transcription', '_uuid': 'uuid1'}, - '_uuid':'uuid3', + '_uuid': 'uuid3', 'language': 'es', 'value': 'Esto es un audio que estoy intentando' ' transcribir pero yo lo edité', @@ -286,4 +285,3 @@ def test_migrate_submission_extra_to_supplemental(self): } } assert migrated == new_version - From 746766cfb250381da883f3c44851460e021aaa1e Mon Sep 17 00:00:00 2001 From: rgraber Date: Wed, 29 Oct 2025 09:52:24 -0400 Subject: [PATCH 14/18] fixup!: stuff --- kobo/apps/subsequences/schemas.py | 2 +- kobo/apps/subsequences/utils/versioning.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kobo/apps/subsequences/schemas.py b/kobo/apps/subsequences/schemas.py index 333ff79c32..2ef53865c4 100644 --- a/kobo/apps/subsequences/schemas.py +++ b/kobo/apps/subsequences/schemas.py @@ -29,7 +29,7 @@ def validate_submission_supplement(asset: 'kpi.models.Asset', supplement: dict): - jsonschema.validate(get_submission_supplement_schema(asset), supplement) + jsonschema.validate(supplement, get_submission_supplement_schema(asset)) def get_submission_supplement_schema(asset: 'kpi.models.Asset') -> dict: diff --git a/kobo/apps/subsequences/utils/versioning.py b/kobo/apps/subsequences/utils/versioning.py index 95436a561b..b5af63036f 100644 --- a/kobo/apps/subsequences/utils/versioning.py +++ b/kobo/apps/subsequences/utils/versioning.py @@ -76,7 +76,7 @@ def _new_revision_from_old(old_transcript_revision_dict: dict) -> dict | None: '_dateCreated': old_transcript_revision_dict.get('dateModified', None), 'language': old_transcript_revision_dict['languageCode'], 'value': old_transcript_revision_dict['value'], - '_uuid': uuid.uuid4(), + '_uuid': str(uuid.uuid4()), '_dateAccepted': None, } From 263227f4458f901a853eb22daa70295d2bc5d583 Mon Sep 17 00:00:00 2001 From: rgraber Date: Fri, 31 Oct 2025 08:34:34 -0400 Subject: [PATCH 15/18] fixup!: changes from review --- kobo/apps/subsequences/utils/versioning.py | 247 ++++++++++----------- 1 file changed, 123 insertions(+), 124 deletions(-) diff --git a/kobo/apps/subsequences/utils/versioning.py b/kobo/apps/subsequences/utils/versioning.py index b5af63036f..b4f1b02656 100644 --- a/kobo/apps/subsequences/utils/versioning.py +++ b/kobo/apps/subsequences/utils/versioning.py @@ -5,6 +5,128 @@ from ..constants import SCHEMA_VERSIONS +def migrate_advanced_features(advanced_features: dict) -> dict | None: + + if advanced_features.get('_version') == SCHEMA_VERSIONS[0]: + return + + migrated_advanced_features = {'_version': SCHEMA_VERSIONS[0], '_actionConfigs': {}} + + actionConfigs = migrated_advanced_features['_actionConfigs'] + for key, value in advanced_features.items(): + if ( + key == 'transcript' + and value + and 'languages' in value + and value['languages'] + ): + actionConfigs['manual_transcription'] = [ + {'language': language} for language in value['languages'] + ] + + if ( + key == 'translation' + and value + and 'languages' in value + and value['languages'] + ): + actionConfigs['manual_translation'] = [ + {'language': language} for language in value['languages'] + ] + + if key == 'qual': + raise NotImplementedError + + return migrated_advanced_features + + +def migrate_submission_supplementals(supplemental_data: dict) -> dict | None: + if supplemental_data.get('_version') == SCHEMA_VERSIONS[0]: + return + supplemental = { + '_version': SCHEMA_VERSIONS[0], + } + for question_xpath, action_results in supplemental_data.items(): + question_results_by_action = {} + + # get all the automatic result data + automatic_transcript = action_results.get('googlets', {}) + automatic_transcript_language = automatic_transcript.get('languageCode') + automatic_transcript_value = automatic_transcript.get('value') + automatic_translation = action_results.get('googletx', {}) + automatic_translation_language = automatic_translation.get('languageCode') + automatic_translation_value = automatic_translation.get('value') + automatic_translation_source_language = automatic_translation.get('source') + + # divide transcripts into manual and automatic + manual_transcripts, automatic_transcripts = ( + _separate_manual_and_automatic_versions( + action_results.get('transcript'), + automatic_transcript_language, + automatic_transcript_value, + ) + ) + + if len(manual_transcripts) > 0: + question_results_by_action['manual_transcription'] = ( + _version_list_to_summary_dict(manual_transcripts) + ) + if len(automatic_transcripts) > 0: + question_results_by_action['automatic_google_transcription'] = ( + _version_list_to_summary_dict(automatic_transcripts) + ) + + # process translations + translations_dict = action_results.get('translation', {}) + automatic_translations = {} + manual_translations = {} + + # divide translations into manual and automatic by language + for language_code, translations in translations_dict.items(): + manual_translations_for_language, automatic_translations_for_language = ( + _separate_manual_and_automatic_versions( + translations, + automatic_translation_language, + automatic_translation_value, + language_code, + ) + ) + + all_tagged_transcripts = _combine_source_transcripts( + manual_transcripts, automatic_transcripts + ) + if len(automatic_translations_for_language) > 0: + _add_translation_sources( + automatic_translations_for_language, + all_tagged_transcripts, + automatic_translation_source_language, + ) + automatic_translations[language_code] = _version_list_to_summary_dict( + automatic_translations_for_language + ) + if len(manual_translations_for_language) > 0: + _add_translation_sources( + manual_translations_for_language, all_tagged_transcripts + ) + manual_translations[language_code] = _version_list_to_summary_dict( + manual_translations_for_language + ) + if automatic_translations != {}: + question_results_by_action['automatic_google_translation'] = ( + automatic_translations + ) + if manual_translations != {}: + question_results_by_action['manual_translation'] = manual_translations + supplemental[question_xpath] = question_results_by_action + + return supplemental + + +def set_version(schema: dict) -> dict: + schema['_version'] = SCHEMA_VERSIONS[0] + return schema + + def _add_translation_sources( version_list, all_tagged_transcripts, automatic_translation_source_language=None ): @@ -73,7 +195,7 @@ def _new_revision_from_old(old_transcript_revision_dict: dict) -> dict | None: ): return None return { - '_dateCreated': old_transcript_revision_dict.get('dateModified', None), + '_dateCreated': old_transcript_revision_dict.get('dateModified'), 'language': old_transcript_revision_dict['languageCode'], 'value': old_transcript_revision_dict['value'], '_uuid': str(uuid.uuid4()), @@ -133,126 +255,3 @@ def _version_list_to_summary_dict(list_of_versions: list[dict]) -> dict: '_dateModified': list_of_versions[0]['_dateCreated'], '_versions': list_of_versions, } - - -def migrate_advanced_features(advanced_features: dict) -> dict | None: - - if advanced_features.get('_version') == SCHEMA_VERSIONS[0]: - return - - migrated_advanced_features = { - '_version': SCHEMA_VERSIONS[0], - '_actionConfigs': {} - } - - actionConfigs = migrated_advanced_features['_actionConfigs'] - for key, value in advanced_features.items(): - if ( - key == 'transcript' - and value - and 'languages' in value - and value['languages'] - ): - actionConfigs['manual_transcription'] = [ - {'language': language} for language in value['languages'] - ] - - if ( - key == 'translation' - and value - and 'languages' in value - and value['languages'] - ): - actionConfigs['manual_translation'] = [ - {'language': language} for language in value['languages'] - ] - - if key == 'qual': - raise NotImplementedError - - return migrated_advanced_features - - -def migrate_submission_supplementals(supplemental_data: dict) -> dict | None: - if supplemental_data.get('_version', None) == SCHEMA_VERSIONS[0]: - return - supplemental = { - '_version': SCHEMA_VERSIONS[0], - } - for question_xpath, action_results in supplemental_data.items(): - question_results_by_action = {} - - # get all the automatic result data - automatic_transcript = action_results.get('googlets', {}) - automatic_transcript_language = automatic_transcript.get('languageCode', None) - automatic_transcript_value = automatic_transcript.get('value', None) - automatic_translation = action_results.get('googletx', {}) - automatic_translation_language = automatic_translation.get('languageCode') - automatic_translation_value = automatic_translation.get('value') - automatic_translation_source_language = automatic_translation.get('source') - - # divide transcripts into manual and automatic - manual_transcripts, automatic_transcripts = ( - _separate_manual_and_automatic_versions( - action_results.get('transcript', None), - automatic_transcript_language, - automatic_transcript_value, - ) - ) - - if len(manual_transcripts) > 0: - question_results_by_action['manual_transcription'] = ( - _version_list_to_summary_dict(manual_transcripts) - ) - if len(automatic_transcripts) > 0: - question_results_by_action['automatic_transcription'] = ( - _version_list_to_summary_dict(automatic_transcripts) - ) - - # process translations - translations_dict = action_results.get('translation', {}) - automatic_translations = {} - manual_translations = {} - - # divide translations into manual and automatic by language - for language_code, translations in translations_dict.items(): - manual_translations_for_language, automatic_translations_for_language = ( - _separate_manual_and_automatic_versions( - translations, - automatic_translation_language, - automatic_translation_value, - language_code, - ) - ) - - all_tagged_transcripts = _combine_source_transcripts( - manual_transcripts, automatic_transcripts - ) - if len(automatic_translations_for_language) > 0: - _add_translation_sources( - automatic_translations_for_language, - all_tagged_transcripts, - automatic_translation_source_language, - ) - automatic_translations[language_code] = _version_list_to_summary_dict( - automatic_translations_for_language - ) - if len(manual_translations_for_language) > 0: - _add_translation_sources( - manual_translations_for_language, all_tagged_transcripts - ) - manual_translations[language_code] = _version_list_to_summary_dict( - manual_translations_for_language - ) - if automatic_translations != {}: - question_results_by_action['automatic_translation'] = automatic_translations - if manual_translations != {}: - question_results_by_action['manual_translation'] = manual_translations - supplemental[question_xpath] = question_results_by_action - - return supplemental - - -def set_version(schema: dict) -> dict: - schema['_version'] = SCHEMA_VERSIONS[0] - return schema From 8569da140d777ee1cf71aac692862908b035786a Mon Sep 17 00:00:00 2001 From: rgraber Date: Tue, 4 Nov 2025 08:45:50 -0500 Subject: [PATCH 16/18] fixup!: action names --- kobo/apps/subsequences/tests/test_versioning.py | 6 +++--- kobo/apps/subsequences/utils/versioning.py | 2 +- .../tests/test_submission_extras_api_post.py | 1 + 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/kobo/apps/subsequences/tests/test_versioning.py b/kobo/apps/subsequences/tests/test_versioning.py index 52961f17b8..a29a662b95 100644 --- a/kobo/apps/subsequences/tests/test_versioning.py +++ b/kobo/apps/subsequences/tests/test_versioning.py @@ -215,7 +215,7 @@ def test_migrate_submission_extra_to_supplemental(self): new_version = { '_version': '20250820', 'Audio_question': { - 'automatic_transcription': { + 'automatic_google_transcription': { '_dateCreated': a_year_and_a_day_ago, '_dateModified': a_year_and_a_day_ago, '_versions': [ @@ -229,7 +229,7 @@ def test_migrate_submission_extra_to_supplemental(self): } ] }, - 'automatic_translation': { + 'automatic_google_translation': { 'es': { '_dateCreated': one_year_ago, '_dateModified': one_year_ago, @@ -238,7 +238,7 @@ def test_migrate_submission_extra_to_supplemental(self): '_dateCreated': one_year_ago, '_dateAccepted': now.isoformat(), '_dependency': { - '_actionId': 'automatic_transcription', + '_actionId': 'automatic_google_transcription', '_uuid': 'uuid2', }, '_uuid': 'uuid4', diff --git a/kobo/apps/subsequences/utils/versioning.py b/kobo/apps/subsequences/utils/versioning.py index b4f1b02656..62a9e8ba00 100644 --- a/kobo/apps/subsequences/utils/versioning.py +++ b/kobo/apps/subsequences/utils/versioning.py @@ -151,7 +151,7 @@ def _combine_source_transcripts(manual_transcripts, automatic_transcripts): for transcript in manual_transcripts ] tagged_automatic_transcripts = [ - {**transcript, '_actionId': 'automatic_transcription'} + {**transcript, '_actionId': 'automatic_google_transcription'} for transcript in automatic_transcripts ] diff --git a/kobo/apps/subsequences__old/tests/test_submission_extras_api_post.py b/kobo/apps/subsequences__old/tests/test_submission_extras_api_post.py index 73a86bd91e..97fee2ff9f 100644 --- a/kobo/apps/subsequences__old/tests/test_submission_extras_api_post.py +++ b/kobo/apps/subsequences__old/tests/test_submission_extras_api_post.py @@ -332,6 +332,7 @@ def setUp(self): 'values': ['q1'], } }) + breakpoint() self.act1 = next(self.asset.get_advanced_feature_instances()) def test_simplest(self): From e2a43ed93a92c9db85df71766630c19f877186c5 Mon Sep 17 00:00:00 2001 From: Rebecca Graber Date: Wed, 5 Nov 2025 15:18:20 -0500 Subject: [PATCH 17/18] fixup: accidental change --- .../subsequences__old/tests/test_submission_extras_api_post.py | 1 - 1 file changed, 1 deletion(-) diff --git a/kobo/apps/subsequences__old/tests/test_submission_extras_api_post.py b/kobo/apps/subsequences__old/tests/test_submission_extras_api_post.py index 97fee2ff9f..73a86bd91e 100644 --- a/kobo/apps/subsequences__old/tests/test_submission_extras_api_post.py +++ b/kobo/apps/subsequences__old/tests/test_submission_extras_api_post.py @@ -332,7 +332,6 @@ def setUp(self): 'values': ['q1'], } }) - breakpoint() self.act1 = next(self.asset.get_advanced_feature_instances()) def test_simplest(self): From 65881505c88ea86f38ed24ba8683071ad2a35b4e Mon Sep 17 00:00:00 2001 From: rgraber Date: Thu, 13 Nov 2025 08:19:17 -0500 Subject: [PATCH 18/18] fixup!: auto accept manual --- kobo/apps/subsequences/tests/test_versioning.py | 6 +++--- kobo/apps/subsequences/utils/versioning.py | 7 ++++--- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/kobo/apps/subsequences/tests/test_versioning.py b/kobo/apps/subsequences/tests/test_versioning.py index a29a662b95..b84efc1824 100644 --- a/kobo/apps/subsequences/tests/test_versioning.py +++ b/kobo/apps/subsequences/tests/test_versioning.py @@ -49,7 +49,7 @@ def test_new_revision_from_old(self): assert result['language'] == old['languageCode'] assert result['_dateCreated'] == old['dateModified'] assert result['_uuid'] is not None - assert result['_dateAccepted'] is None + assert result['_dateAccepted'] == now.isoformat() def test_new_transcript_revision_from_old_returns_none_for_bad_data(self): old = {'badly': 'formatted'} @@ -256,7 +256,7 @@ def test_migrate_submission_extra_to_supplemental(self): '_versions': [ { '_dateCreated': one_day_ago, - '_dateAccepted': None, + '_dateAccepted': now.isoformat(), '_uuid': 'uuid1', 'language': 'en', 'value': 'This is audio that I am trying to ' @@ -271,7 +271,7 @@ def test_migrate_submission_extra_to_supplemental(self): '_versions': [ { '_dateCreated': now.isoformat(), - '_dateAccepted': None, + '_dateAccepted': now.isoformat(), '_dependency': {'_actionId': 'manual_transcription', '_uuid': 'uuid1'}, '_uuid': 'uuid3', diff --git a/kobo/apps/subsequences/utils/versioning.py b/kobo/apps/subsequences/utils/versioning.py index 62a9e8ba00..9d29d38b9c 100644 --- a/kobo/apps/subsequences/utils/versioning.py +++ b/kobo/apps/subsequences/utils/versioning.py @@ -188,6 +188,7 @@ def _determine_source_transcript( def _new_revision_from_old(old_transcript_revision_dict: dict) -> dict | None: + now = timezone.now().isoformat() # ignore bad data if ( 'languageCode' not in old_transcript_revision_dict @@ -199,7 +200,8 @@ def _new_revision_from_old(old_transcript_revision_dict: dict) -> dict | None: 'language': old_transcript_revision_dict['languageCode'], 'value': old_transcript_revision_dict['value'], '_uuid': str(uuid.uuid4()), - '_dateAccepted': None, + # all preexisting translations/transcripts are considered accepted + '_dateAccepted': now, } @@ -236,9 +238,8 @@ def _separate_manual_and_automatic_versions( automatic_versions if matches_automatic_result else manual_versions ) if matches_automatic_result: - # automatic versions also need a status and a date accepted + # automatic versions also need a status revision_formatted['status'] = 'complete' - revision_formatted['_dateAccepted'] = timezone.now().isoformat() correct_version_list_to_append.append(revision_formatted) # they should be sorted anyway, but just make sure in case the input values