Skip to content

Commit 637eecb

Browse files
committed
Merge remote-tracking branch 'origin/1.1.x' into add-cli-pathing-application
2 parents 0e91c96 + 87acd81 commit 637eecb

File tree

7 files changed

+78
-9
lines changed

7 files changed

+78
-9
lines changed

biothings/hub/dataindex/indexer_task.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
from biothings.utils.es import ESIndex as BaseESIndex
1111
from biothings.utils.loggers import get_logger
12+
from biothings.utils.serializer import to_json
1213

1314
try:
1415
from biothings.utils.mongo import doc_feeder
@@ -92,8 +93,12 @@ def _action(doc):
9293
self.logger.error(error)
9394
self.logger.error("Document ID %s failed: %s", document_id, reason)
9495

95-
self.logger.warning("Discovered errors during the bulk index task. Defaulting to 0 indexed documents")
96-
return 0
96+
serialized_errors = to_json(errors, indent=True)
97+
message = (
98+
f"Bulk indexing failed for index '{self.index_name}'. "
99+
f"Elasticsearch responded with errors:\n{serialized_errors}"
100+
)
101+
raise helpers.BulkIndexError(message, errors) from e
97102

98103
# NOTE
99104
# Why doesn't "mget", "mexists", "mindex" belong to the base class?

biothings/hub/datainspect/inspector.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from biothings.hub import INSPECTOR_CATEGORY
1111
from biothings.hub.databuild.backend import create_backend
1212
from biothings.hub.datainspect.doc_inspect import (
13+
clean_big_nums,
1314
compute_metadata,
1415
flatten_and_validate,
1516
get_converters,

biothings/utils/common.py

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
import random
2222
import string
2323
import sys
24+
import tarfile
2425
import time
2526
import types
2627
import urllib.parse
@@ -160,7 +161,7 @@ def safewfile(filename, prompt=True, default="C", mode="w"):
160161

161162
def anyfile(infile, mode="r"):
162163
"""
163-
return a file handler with the support for gzip/zip comppressed files.
164+
return a file handler with the support for gzip/zip compressed files.
164165
if infile is a two value tuple, then first one is the compressed file;
165166
the second one is the actual filename in the compressed file.
166167
e.g., ('a.zip', 'aa.txt')
@@ -171,6 +172,25 @@ def anyfile(infile, mode="r"):
171172
else:
172173
rawfile = os.path.splitext(infile)[0]
173174
filetype = os.path.splitext(infile)[1].lower()
175+
176+
177+
# use tarfile built-in method to check for tar file before anything else
178+
if tarfile.is_tarfile(infile):
179+
tar_file = tarfile.open(infile, mode)
180+
try:
181+
extracted = tar_file.extractfile(rawfile)
182+
except KeyError:
183+
# provided rawfile does not appear in the tarball
184+
tar_file.close()
185+
raise Exception("the provided tar file does not contain the target member.")
186+
187+
# extracted member is not a regular file or link
188+
if extracted is None:
189+
tar_file.close()
190+
raise Exception("invalid target file: must be a regular file or a link")
191+
192+
return io.TextIOWrapper(extracted)
193+
174194
if filetype == ".gz":
175195
# import gzip
176196
in_f = io.TextIOWrapper(gzip.GzipFile(infile, mode))

biothings/web/query/engine.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,12 +22,15 @@
2222
"""
2323

2424
import asyncio
25+
import logging
2526

2627
from elasticsearch import NotFoundError, RequestError
2728
from elasticsearch.dsl import MultiSearch, Search
2829

2930
from biothings.web.query.builder import ESScrollID
3031

32+
logger = logging.getLogger(__name__)
33+
3134

3235
class ResultInterrupt(Exception):
3336
def __init__(self, data):
@@ -139,6 +142,13 @@ async def execute(self, query, **options):
139142
raise RawResultInterrupt(res)
140143

141144
if not res["hits"]["hits"]:
145+
scroll_id=query.data
146+
try:
147+
await self.client.clear_scroll(scroll_id=scroll_id)
148+
logger.info("Scroll context cleared: %s", scroll_id)
149+
except NotFoundError as e:
150+
logger.warning("Scroll context not found (ID: %s): %s", scroll_id, str(e))
151+
# Always raise this exception regardless of whether clear_scroll succeeds
142152
raise EndScrollInterrupt()
143153

144154
return res

biothings/web/query/formatter.py

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -84,13 +84,26 @@ class ESResultFormatter(ResultFormatter):
8484
class _Hits(Hits):
8585
def __init__(self, *args, **kwargs):
8686
super().__init__(*args, **kwargs)
87+
# Check if this is an error response from Elasticsearch
88+
if "error" in self.data:
89+
logger.error("ES returned error response: %s", self.data)
90+
raise ValueError("Invalid response format")
91+
8792
# make sure the document is coming from
8893
# elasticsearch at initialization time
89-
assert "hits" in self.data
90-
assert "total" in self.data["hits"]
91-
assert "hits" in self.data["hits"]
94+
if "hits" not in self.data:
95+
logger.error("ES response missing 'hits' field. Response data: %s", self.data)
96+
raise ValueError("Response missing 'hits' field")
97+
if "total" not in self.data["hits"]:
98+
logger.error("ES response missing 'hits.total' field. Response data: %s", self.data)
99+
raise ValueError("Response missing 'hits.total' field")
100+
if "hits" not in self.data["hits"]:
101+
logger.error("ES response missing 'hits.hits' field. Response data: %s", self.data)
102+
raise ValueError("Response missing 'hits.hits' field")
92103
for hit in self.data["hits"]["hits"]:
93-
assert "_source" in hit
104+
if "_source" not in hit:
105+
logger.error("ES hit missing '_source' field. Hit data: %s", hit)
106+
raise ValueError("Hit missing '_source' field")
94107

95108
class _Doc(Doc):
96109
pass

biothings/web/query/pipeline.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,10 @@ async def _(*args, **kwargs):
147147
elif error_type == "index_not_found_exception":
148148
raise QueryPipelineException(500, error_type)
149149

150+
elif error_type == "es_rejected_execution_exception":
151+
# ES cluster is overloaded, all thread pools at capacity
152+
raise QueryPipelineException(503, "Service Unavailable", "Elasticsearch cluster overloaded")
153+
150154
else: # unexpected
151155
raise
152156

tests/web/test_es_exceptions.py

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -160,10 +160,10 @@ async def func():
160160

161161

162162
@pytest.mark.asyncio
163-
async def test_generic_exception():
163+
async def test_index_not_found_exception():
164164
@capturesESExceptions
165165
async def func():
166-
exc = Exception(message="test_generic_exception", meta={}, body={})
166+
exc = Exception(message="test_index_not_found_exception", meta={}, body={})
167167
exc.status_code = 500
168168
exc.info = {"error": {"type": "index_not_found_exception", "reason": "test_reason"}}
169169
raise exc
@@ -175,6 +175,22 @@ async def func():
175175
assert exc_info.value.details == "Exception() takes no keyword arguments"
176176

177177

178+
@pytest.mark.asyncio
179+
async def test_es_rejected_execution_exception():
180+
@capturesESExceptions
181+
async def func():
182+
exc = TransportError("test_es_rejected_execution_exception")
183+
exc.status_code = 503
184+
exc.info = {"error": {"type": "es_rejected_execution_exception", "reason": "rejected execution of TimedRunnable..."}}
185+
raise exc
186+
187+
with pytest.raises(QueryPipelineException) as exc_info:
188+
await func()
189+
assert exc_info.value.code == 503
190+
assert exc_info.value.summary == "Service Unavailable"
191+
assert exc_info.value.details == "Elasticsearch cluster overloaded"
192+
193+
178194
@pytest.mark.asyncio
179195
async def test_search_phase_execution_exception_rejected_execution():
180196
@capturesESExceptions

0 commit comments

Comments
 (0)