
Commit bad218c

remove deletion operation in es connector (#570)
In the ES connector, the function delete_by_record_id is not necessary: when a document_id already exists in the index, we want to replace the record with new content, which is equivalent to an upsert operation. Deleting existing records is also dangerous because it may inadvertently remove unintended content. For more details see: https://linear.app/unstructured/issue/ENG-360/upsert-in-elastic-incorrectly-implemented
1 parent ac0a0ab commit bad218c
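
For context, Elasticsearch's default write operation overwrites an existing document that has the same `_id` and creates it otherwise, so re-running an upload already behaves like an upsert with no prior delete needed. Below is a minimal sketch of that semantic with the official Python client; the cluster URL, index name, id, and document bodies are illustrative only, not values used by the connector.

```python
from elasticsearch import Elasticsearch

# Hypothetical local cluster and index, used only for illustration.
client = Elasticsearch("http://localhost:9200")

doc_id = "element-123"  # a stable, deterministic id per element

# First write creates the document.
client.index(index="demo-index", id=doc_id, document={"text": "original content"})

# Re-indexing with the same _id replaces the document in place (an upsert),
# so there is no need to delete the old record first.
client.index(index="demo-index", id=doc_id, document={"text": "updated content"})

client.indices.refresh(index="demo-index")
print(client.count(index="demo-index")["count"])  # still 1 document
```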

File tree

4 files changed: +60 additions, -27 deletions


CHANGELOG.md

Lines changed: 6 additions & 2 deletions
@@ -1,14 +1,18 @@
+## 1.1.3
+
+* **Fix: Remove unnecessary deletion operation in ES connector**
+
 ## 1.1.2
 
-* **Fix**: DeltaTableConnectionConfig default assignment is compliant with stricter typing in Pydantic
+* **Fix: DeltaTableConnectionConfig default assignment is compliant with stricter typing in Pydantic**
 
 ## 1.1.1
 
 * **Fix: Update examples**
 
 ## 1.1.0
 
-* **Feature**: Embedding with OpenAI (or Azure OpenAI) can trust custom certificate authority by specifying environment variable REQUESTS_CA_BUNDLE.
+* **Feature: Embedding with OpenAI (or Azure OpenAI) can trust custom certificate authority by specifying environment variable REQUESTS_CA_BUNDLE.**
 
 ## 1.0.59
 

test/integration/connectors/elasticsearch/test_elasticsearch.py

Lines changed: 47 additions & 5 deletions
@@ -280,11 +280,6 @@ async def test_elasticsearch_destination(
     with get_client() as client:
         validate_count(client=client, expected_count=expected_count, index_name=destination_index)
 
-    # Rerun and make sure the same documents get updated
-    uploader.run(path=staged_filepath, file_data=file_data)
-    with get_client() as client:
-        validate_count(client=client, expected_count=expected_count, index_name=destination_index)
-
 
 @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, NOSQL_TAG)
 def test_elasticsearch_destination_precheck_fail():
@@ -333,3 +328,50 @@ def test_elasticsearch_stager(
         stager=stager,
         tmp_dir=tmp_path,
     )
+
+
+@pytest.mark.asyncio
+@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, NOSQL_TAG)
+async def test_elasticsearch_upsert_destination(
+    upload_file: Path,
+    destination_index: str,
+    tmp_path: Path,
+):
+    file_data = FileData(
+        source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
+        connector_type=CONNECTOR_TYPE,
+        identifier="mock file data",
+    )
+    connection_config = ElasticsearchConnectionConfig(
+        access_config=ElasticsearchAccessConfig(password=ES_PASSWORD),
+        username=ES_USERNAME,
+        hosts=["http://localhost:9200"],
+    )
+    stager = ElasticsearchUploadStager(
+        upload_stager_config=ElasticsearchUploadStagerConfig(index_name=destination_index)
+    )
+
+    uploader = ElasticsearchUploader(
+        connection_config=connection_config,
+        upload_config=ElasticsearchUploaderConfig(index_name=destination_index),
+    )
+    staged_filepath = stager.run(
+        elements_filepath=upload_file,
+        file_data=file_data,
+        output_dir=tmp_path,
+        output_filename=upload_file.name,
+    )
+    uploader.precheck()
+    uploader.run(path=staged_filepath, file_data=file_data)
+
+    # Run validation
+    with staged_filepath.open() as f:
+        staged_elements = json.load(f)
+    expected_count = len(staged_elements)
+    with get_client() as client:
+        validate_count(client=client, expected_count=expected_count, index_name=destination_index)
+
+    # Rerun and make sure the same documents get updated
+    uploader.run(path=staged_filepath, file_data=file_data)
+    with get_client() as client:
+        validate_count(client=client, expected_count=expected_count, index_name=destination_index)
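
The rerun at the end of the new test is what actually exercises the upsert path: the stager assigns each element a deterministic `_id`, so the second `uploader.run` overwrites the same documents and the index count must not grow. The `validate_count` helper is defined elsewhere in the test suite; a hypothetical equivalent using the Python client could look like the sketch below (the refresh-then-count pattern and the assertion message are assumptions, not the repository's actual helper).

```python
from elasticsearch import Elasticsearch


def validate_count(client: Elasticsearch, expected_count: int, index_name: str) -> None:
    # Hypothetical stand-in for the helper used by the tests above.
    # Refresh so documents written by the bulk upload are visible to count().
    client.indices.refresh(index=index_name)
    current_count = client.count(index=index_name)["count"]
    assert current_count == expected_count, (
        f"expected {expected_count} documents in {index_name}, found {current_count}"
    )
```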

unstructured_ingest/__version__.py

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-__version__ = "1.1.2" # pragma: no cover
+__version__ = "1.1.3" # pragma: no cover

unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py

Lines changed: 6 additions & 19 deletions
@@ -19,7 +19,6 @@
     DestinationConnectionError,
     SourceConnectionError,
     SourceConnectionNetworkError,
-    WriteError,
 )
 from unstructured_ingest.interfaces import (
     AccessConfig,
@@ -336,6 +335,8 @@ class ElasticsearchUploadStager(UploadStager):
 
     def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
         data = element_dict.copy()
+        # when _op_type is not specified, it defaults to "index":
+        # Overwrites if exists, creates if not.
         resp = {
             "_index": self.upload_stager_config.index_name,
             "_id": get_enhanced_element_id(element_dict=data, file_data=file_data),
@@ -397,23 +398,6 @@ def load_parallel_bulk(self):
 
         return parallel_bulk
 
-    def delete_by_record_id(self, client, file_data: FileData) -> None:
-        logger.debug(
-            f"deleting any content with metadata {RECORD_ID_LABEL}={file_data.identifier} "
-            f"from {self.upload_config.index_name} index"
-        )
-        delete_resp = client.delete_by_query(
-            index=self.upload_config.index_name,
-            body={"query": {"match": {self.upload_config.record_id_key: file_data.identifier}}},
-        )
-        logger.info(
-            "deleted {} records from index {}".format(
-                delete_resp["deleted"], self.upload_config.index_name
-            )
-        )
-        if failures := delete_resp.get("failures"):
-            raise WriteError(f"failed to delete records: {failures}")
-
     @requires_dependencies(["elasticsearch"], extras="elasticsearch")
     def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None: # noqa: E501
         from elasticsearch.helpers.errors import BulkIndexError
@@ -429,7 +413,6 @@ def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None
         )
 
         with self.connection_config.get_client() as client:
-            self.delete_by_record_id(client=client, file_data=file_data)
             if not client.indices.exists(index=self.upload_config.index_name):
                 logger.warning(
                     f"{(self.__class__.__name__).replace('Uploader', '')} index does not exist: "
@@ -446,6 +429,10 @@ def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None
                         thread_count=self.upload_config.num_threads,
                     )
                     collections.deque(iterator, maxlen=0)
+                    logger.info(
+                        f"uploaded batch of {len(batch)} elements to index "
+                        f"{self.upload_config.index_name}"
+                    )
             except BulkIndexError as e:
                 sanitized_errors = [
                     self._sanitize_bulk_index_error(error) for error in e.errors
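
The comment added in conform_dict is the crux of the change: each bulk action carries a deterministic `_id` and no explicit `_op_type`, so the bulk helpers fall back to the `index` operation, which replaces an existing document rather than requiring a prior delete_by_query. A minimal, self-contained sketch of that pattern with the elasticsearch bulk helpers follows; the cluster URL, index name, ids, documents, and thread count are illustrative assumptions, not the connector's real values.

```python
import collections

from elasticsearch import Elasticsearch
from elasticsearch.helpers import parallel_bulk

client = Elasticsearch("http://localhost:9200")  # hypothetical local cluster

# Actions shaped like the stager's output: no _op_type, so "index" is used,
# meaning an existing document with the same _id is overwritten.
actions = [
    {"_index": "demo-index", "_id": f"element-{i}", "text": f"content {i}"}
    for i in range(3)
]

# Consume the generator without keeping results in memory, as run_data does.
collections.deque(parallel_bulk(client, actions, thread_count=2), maxlen=0)

# Re-running the same actions upserts the same three documents.
collections.deque(parallel_bulk(client, actions, thread_count=2), maxlen=0)

client.indices.refresh(index="demo-index")
print(client.count(index="demo-index")["count"])  # 3
```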
