
Commit dfebdcc

Merge pull request #19 from epoch8/fix-drop-duplicates-on-update-from-ls
Drop Duplicates on fetch from LS + test
2 parents 8b0c5e0 + 6f0373a

4 files changed, +59 −1 lines changed

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
@@ -1,3 +1,6 @@
+# 0.3.1
+* Add drop_duplicates on fetch data from LabelStudio (get_annotations_from_ls)
+
 # 0.3.0
 
 * Update datapipe-core version (0.13.0-alpha.4)
datapipe_label_studio_lite/pipeline.py

Lines changed: 2 additions & 0 deletions
@@ -383,6 +383,8 @@ def _cleanup(values):
         ],
     }
 )
+# Remove possible duplicates coming from LabelStudio.
+output_df = output_df.drop_duplicates(subset=self.primary_keys, keep="last")
 output_dts[0].store_chunk(output_df)
 
 if len(updated_ats) > 0:
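The new call relies on standard pandas semantics rather than anything LabelStudio-specific: `drop_duplicates(subset=self.primary_keys, keep="last")` keeps only the last row seen per primary key, so duplicate tasks fetched from LabelStudio collapse into a single record before the chunk is stored. A minimal sketch of that behaviour, using hypothetical `id`/`text` columns and assuming `primary_keys == ["id"]`:

```python
import pandas as pd

# Hypothetical rows as fetched from LabelStudio: "task_1" appears twice because
# a duplicate task was imported into the project directly.
output_df = pd.DataFrame(
    {
        "id": ["task_1", "task_2", "task_1"],
        "text": ["task_1_text", "task_2_text", "task_1_new_text"],
    }
)

# Assume the pipeline's primary keys are ["id"]; keep only the last row per key,
# mirroring the line added in pipeline.py.
primary_keys = ["id"]
deduped = output_df.drop_duplicates(subset=primary_keys, keep="last")

assert len(deduped) == 2  # one row per primary key remains
assert deduped.set_index("id").loc["task_1", "text"] == "task_1_new_text"
```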

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "datapipe-label-studio-lite"
-version = "0.3.0"
+version = "0.3.1"
 description = ""
 authors = [
     "Alexander Kozlov <bobokvsky@epoch8.co>",

tests/test_pipeline.py

Lines changed: 53 additions & 0 deletions
@@ -745,3 +745,56 @@ def _gen2():
     # Predictions should not be lost
     # if include_predictions:
     # assert len(df_ls.loc[idx, 'predictions']) == include_prepredictions + include_predictions
+
+
+@parametrize_with_cases(
+    "ds, catalog, steps, project_title, include_preannotations, include_prepredictions, "
+    "include_predictions, label_studio_session, delete_unannotated_tasks_only_on_update",
+    cases=CasesLabelStudio,
+)
+def test_ls_moderation_with_duplicates_in_ls(
+    ds: DataStore,
+    catalog: Catalog,
+    steps: List[DatatableTransformStep],
+    project_title: str,
+    include_preannotations: bool,
+    include_prepredictions: bool,
+    include_predictions: bool,
+    label_studio_session: label_studio_sdk.Client,
+    delete_unannotated_tasks_only_on_update: bool,
+):
+    # This should be ok (project will be created, but without data)
+    run_steps(ds, steps)
+    run_steps(ds, steps)
+
+    # Load the task data for LS into the input table.
+    do_batch_generate(
+        func=gen_data_df,
+        ds=ds,
+        output_dts=[ds.get_table("ls_input_data_raw")],
+    )
+
+    # Add duplicate tasks directly into the LS project.
+    tasks_duplicates_to_add = [
+        {
+            "data": {
+                "id": "task_1",
+                "text": "task_1_new_text"
+            }
+        },
+        {
+            "data": {
+                "id": "task_2",
+                "text": "task_2_new_text"
+            }
+        }
+    ]
+    project = get_project_by_title(label_studio_session, project_title)
+    project.import_tasks(tasks=tasks_duplicates_to_add)
+
+    # Run the transformation.
+    run_steps(ds, steps)
+
+    # Check the number of tasks in LS and the data in the pipeline's output table.
+    assert len(project.get_tasks()) == TASKS_COUNT + len(tasks_duplicates_to_add)
+    assert len(ds.get_table("ls_output").get_data()) == TASKS_COUNT
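After the duplicate import, the LS project holds TASKS_COUNT + 2 tasks, while the pipeline's `ls_output` table is still expected to contain exactly TASKS_COUNT rows: the `drop_duplicates` call added in pipeline.py collapses the duplicated primary keys before the chunk is stored.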
