diff --git a/ddtrace/llmobs/_llmobs.py b/ddtrace/llmobs/_llmobs.py
index 457ead64212..fa6c7e5b838 100644
--- a/ddtrace/llmobs/_llmobs.py
+++ b/ddtrace/llmobs/_llmobs.py
@@ -465,7 +465,7 @@ def _do_annotations(self, span: Span) -> None:
         with self._annotation_context_lock:
             for _, context_id, annotation_kwargs in self._instance._annotations:
                 if current_context_id == context_id:
-                    self.annotate(span, **annotation_kwargs)
+                    self.annotate(span, **annotation_kwargs, _suppress_span_kind_error=True)
 
     def _child_after_fork(self) -> None:
         self._llmobs_span_writer = self._llmobs_span_writer.recreate()
@@ -505,7 +505,7 @@ def _stop_service(self) -> None:
         core.reset_listeners("trace.span_start", self._on_span_start)
         core.reset_listeners("trace.span_finish", self._on_span_finish)
         core.reset_listeners("http.span_inject", self._inject_llmobs_context)
-        core.reset_listeners("http.activate_distributed_headers", self._activate_llmobs_distributed_context)
+        core.reset_listeners("http.activate_distributed_headers", self._activate_llmobs_distributed_context_soft_fail)
         core.reset_listeners("threading.submit", self._current_trace_context)
         core.reset_listeners("threading.execution", self._llmobs_context_provider.activate)
         core.reset_listeners("asyncio.create_task", self._on_asyncio_create_task)
@@ -620,7 +620,7 @@ def enable(
         core.on("trace.span_start", cls._instance._on_span_start)
         core.on("trace.span_finish", cls._instance._on_span_finish)
         core.on("http.span_inject", cls._inject_llmobs_context)
-        core.on("http.activate_distributed_headers", cls._activate_llmobs_distributed_context)
+        core.on("http.activate_distributed_headers", cls._activate_llmobs_distributed_context_soft_fail)
         core.on("threading.submit", cls._instance._current_trace_context, "llmobs_ctx")
         core.on("threading.execution", cls._instance._llmobs_context_provider.activate)
         core.on("asyncio.create_task", cls._instance._on_asyncio_create_task)
@@ -1014,16 +1014,14 @@ def export_span(cls, span: Optional[Span] = None) -> Optional[ExportedLLMObsSpan
         try:
             if span.span_type != SpanTypes.LLM:
                 error = "invalid_span"
-                log.warning("Span must be an LLMObs-generated span.")
-                return None
+                raise Exception("Span must be an LLMObs-generated span.")
             return ExportedLLMObsSpan(
                 span_id=str(span.span_id),
                 trace_id=format_trace_id(span._get_ctx_item(LLMOBS_TRACE_ID) or span.trace_id),
             )
         except (TypeError, AttributeError):
             error = "invalid_span"
-            log.warning("Failed to export span. Span must be a valid Span object.")
-            return None
+            raise Exception("Failed to export span. Span must be a valid Span object.") from None
         finally:
             telemetry.record_span_exported(span, error)
 
@@ -1338,6 +1336,7 @@ def annotate(
         tags: Optional[Dict[str, Any]] = None,
         tool_definitions: Optional[List[Dict[str, Any]]] = None,
         _name: Optional[str] = None,
+        _suppress_span_kind_error: bool = False,
     ) -> None:
         """
         Sets metadata, inputs, outputs, tags, and metrics as provided for a given LLMObs span.
@@ -1397,32 +1396,29 @@ def annotate(
                 span = cls._instance._current_span()
             if span is None:
                 error = "invalid_span_no_active_spans"
-                log.warning("No span provided and no active LLMObs-generated span found.")
-                return
+                raise Exception("No span provided and no active LLMObs-generated span found.")
             if span.span_type != SpanTypes.LLM:
                 error = "invalid_span_type"
-                log.warning("Span must be an LLMObs-generated span.")
-                return
+                raise Exception("Span must be an LLMObs-generated span.")
             if span.finished:
                 error = "invalid_finished_span"
-                log.warning("Cannot annotate a finished span.")
-                return
+                raise Exception("Cannot annotate a finished span.")
             if metadata is not None:
                 if not isinstance(metadata, dict):
                     error = "invalid_metadata"
-                    log.warning("metadata must be a dictionary")
+                    raise Exception("metadata must be a dictionary")
                 else:
                     cls._set_dict_attribute(span, METADATA, metadata)
             if metrics is not None:
                 if not isinstance(metrics, dict) or not all(isinstance(v, (int, float)) for v in metrics.values()):
                     error = "invalid_metrics"
-                    log.warning("metrics must be a dictionary of string key - numeric value pairs.")
+                    raise Exception("metrics must be a dictionary of string key - numeric value pairs.")
                 else:
                     cls._set_dict_attribute(span, METRICS, metrics)
             if tags is not None:
                 if not isinstance(tags, dict):
                     error = "invalid_tags"
-                    log.warning("span tags must be a dictionary of string key - primitive value pairs.")
+                    raise Exception("span tags must be a dictionary of string key - primitive value pairs.")
                 else:
                     session_id = tags.get("session_id")
                     if session_id:
@@ -1441,10 +1437,11 @@ def annotate(
                         cls._set_dict_attribute(span, INPUT_PROMPT, validated_prompt)
                 except (ValueError, TypeError) as e:
                     error = "invalid_prompt"
-                    log.warning("Failed to validate prompt with error:", str(e), exc_info=True)
-            if not span_kind:
-                log.debug("Span kind not specified, skipping annotation for input/output data")
-                return
+                    raise Exception("Failed to validate prompt with error:", str(e))
+            if (
+                not span_kind and not _suppress_span_kind_error
+            ):  # TODO(sabrenner): we should figure out how to remove this check for annotation contexts
+                raise Exception("Span kind not specified, skipping annotation for input/output data")
             if input_data is not None or output_data is not None:
                 if span_kind == "llm":
                     error = cls._tag_llm_io(span, input_messages=input_data, output_messages=output_data)
@@ -1471,7 +1468,9 @@ def _tag_llm_io(cls, span, input_messages=None, output_messages=None) -> Optiona
             if input_messages.messages:
                 span._set_ctx_item(INPUT_MESSAGES, input_messages.messages)
         except TypeError:
-            log.warning("Failed to parse input messages.", exc_info=True)
+            log.warning(
+                "Failed to parse input messages.", exc_info=True
+            )  # TODO: figure out how to raise this error and return the error type
             return "invalid_io_messages"
         if output_messages is None:
             return None
@@ -1482,7 +1481,9 @@ def _tag_llm_io(cls, span, input_messages=None, output_messages=None) -> Optiona
                 return None
             span._set_ctx_item(OUTPUT_MESSAGES, output_messages.messages)
         except TypeError:
-            log.warning("Failed to parse output messages.", exc_info=True)
+            log.warning(
+                "Failed to parse output messages.", exc_info=True
+            )  # TODO: figure out how to raise this error and return the error type
             return "invalid_io_messages"
         return None
 
@@ -1498,7 +1499,9 @@ def _tag_embedding_io(cls, span, input_documents=None, output_text=None) -> Opti
             if input_documents.documents:
                 span._set_ctx_item(INPUT_DOCUMENTS, input_documents.documents)
         except TypeError:
-            log.warning("Failed to parse input documents.", exc_info=True)
+            log.warning(
+                "Failed to parse input documents.", exc_info=True
+            )  # TODO: figure out how to raise this error and return the error type
             return "invalid_embedding_io"
         if output_text is None:
             return None
@@ -1521,7 +1524,9 @@ def _tag_retrieval_io(cls, span, input_text=None, output_documents=None) -> Opti
                 return None
             span._set_ctx_item(OUTPUT_DOCUMENTS, output_documents.documents)
         except TypeError:
-            log.warning("Failed to parse output documents.", exc_info=True)
+            log.warning(
+                "Failed to parse output documents.", exc_info=True
+            )  # TODO: figure out how to raise this error and return the error type
             return "invalid_retrieval_io"
         return None
 
@@ -1712,17 +1717,15 @@ def submit_evaluation(
                 raise TypeError("value must be a boolean for a boolean metric.")
 
         if tags is not None and not isinstance(tags, dict):
-            log.warning("tags must be a dictionary of string key-value pairs.")
-            tags = {}
+            raise Exception("tags must be a dictionary of string key-value pairs.")
 
         ml_app = ml_app if ml_app else config._llmobs_ml_app
         if not ml_app:
            error = "missing_ml_app"
-            log.warning(
+            raise Exception(
                 "ML App name is required for sending evaluation metrics. Evaluation metric data will not be sent. "
                 "Ensure this configuration is set before running your application."
             )
-            return
 
         evaluation_tags = {
             "ddtrace.version": ddtrace.__version__,
@@ -1735,7 +1738,7 @@ def submit_evaluation(
                 evaluation_tags[ensure_text(k)] = ensure_text(v)
             except TypeError:
                 error = "invalid_tags"
-                log.warning("Failed to parse tags. Tags for evaluation metrics must be strings.")
+                raise Exception("Failed to parse tags. Tags for evaluation metrics must be strings.")
 
         evaluation_metric: LLMObsEvaluationMetricEvent = {
             "join_on": join_on,
@@ -1750,20 +1753,20 @@ def submit_evaluation(
 
         if assessment:
             if not isinstance(assessment, str) or assessment not in ("pass", "fail"):
                 error = "invalid_assessment"
-                log.warning("Failed to parse assessment. assessment must be either 'pass' or 'fail'.")
+                raise Exception("Failed to parse assessment. assessment must be either 'pass' or 'fail'.")
             else:
                 evaluation_metric["assessment"] = assessment
         if reasoning:
             if not isinstance(reasoning, str):
                 error = "invalid_reasoning"
-                log.warning("Failed to parse reasoning. reasoning must be a string.")
+                raise Exception("Failed to parse reasoning. reasoning must be a string.")
             else:
                 evaluation_metric["reasoning"] = reasoning
         if metadata:
             if not isinstance(metadata, dict):
                 error = "invalid_metadata"
-                log.warning("metadata must be json serializable dictionary.")
+                raise Exception("metadata must be json serializable dictionary.")
             else:
                 metadata = safe_json(metadata)
                 if metadata and isinstance(metadata, str):
@@ -1801,7 +1804,9 @@ def _inject_llmobs_context(cls, span_context: Context, request_headers: Dict[str
         span_context._meta[PROPAGATED_ML_APP_KEY] = ml_app
 
     @classmethod
-    def inject_distributed_headers(cls, request_headers: Dict[str, str], span: Optional[Span] = None) -> Dict[str, str]:
+    def inject_distributed_headers(
+        cls, request_headers: Dict[str, str], span: Optional[Span] = None, _soft_fail: bool = False
+    ) -> Dict[str, str]:
         """Injects the span's distributed context into the given request headers."""
         if cls.enabled is False:
             log.warning(
@@ -1813,53 +1818,74 @@ def inject_distributed_headers(cls, request_headers: Dict[str, str], span: Optio
         try:
             if not isinstance(request_headers, dict):
                 error = "invalid_request_headers"
-                log.warning("request_headers must be a dictionary of string key-value pairs.")
-                return request_headers
+                if _soft_fail:
+                    log.warning("request_headers must be a dictionary of string key-value pairs.")
+                    return request_headers
+                else:
+                    raise Exception("request_headers must be a dictionary of string key-value pairs.")
             if span is None:
                 span = cls._instance.tracer.current_span()
             if span is None:
                 error = "no_active_span"
-                log.warning("No span provided and no currently active span found.")
-                return request_headers
+                if _soft_fail:
+                    log.warning("No span provided and no currently active span found.")
+                    return request_headers
+                raise Exception("No span provided and no currently active span found.")
             if not isinstance(span, Span):
                 error = "invalid_span"
-                log.warning("span must be a valid Span object. Distributed context will not be injected.")
-                return request_headers
+                if _soft_fail:
+                    log.warning("span must be a valid Span object. Distributed context will not be injected.")
+                    return request_headers
+                raise Exception("span must be a valid Span object. Distributed context will not be injected.")
             HTTPPropagator.inject(span.context, request_headers)
             return request_headers
         finally:
             telemetry.record_inject_distributed_headers(error)
 
     @classmethod
-    def _activate_llmobs_distributed_context(cls, request_headers: Dict[str, str], context: Context) -> Optional[str]:
-        if cls.enabled is False:
-            return None
-        if not context.trace_id or not context.span_id:
-            log.warning("Failed to extract trace/span ID from request headers.")
-            return "missing_context"
-        _parent_id = context._meta.get(PROPAGATED_PARENT_ID_KEY)
-        if _parent_id is None:
-            log.debug("Failed to extract LLMObs parent ID from request headers.")
-            return "missing_parent_id"
+    def _activate_llmobs_distributed_context_soft_fail(cls, request_headers: Dict[str, str], context: Context) -> None:
+        cls._activate_llmobs_distributed_context(request_headers, context, _soft_fail=True)
+
+    @classmethod
+    def _activate_llmobs_distributed_context(
+        cls, request_headers: Dict[str, str], context: Context, _soft_fail: bool = False
+    ) -> None:
+        error = None
         try:
-            parent_id = int(_parent_id)
-        except ValueError:
-            log.warning("Failed to parse LLMObs parent ID from request headers.")
-            return "invalid_parent_id"
-        parent_llmobs_trace_id = context._meta.get(PROPAGATED_LLMOBS_TRACE_ID_KEY)
-        if parent_llmobs_trace_id is None:
-            log.debug("Failed to extract LLMObs trace ID from request headers. Expected string, got None.")
+            if cls.enabled is False:
+                return
+            if not context.trace_id or not context.span_id:
+                error = "missing_context"
+                if _soft_fail:
+                    log.warning("Failed to extract trace/span ID from request headers.")
+                    return
+                raise Exception("Failed to extract trace/span ID from request headers.")
+            _parent_id = context._meta.get(PROPAGATED_PARENT_ID_KEY)
+            if _parent_id is None:
+                error = "missing_parent_id"
+                log.debug("Failed to extract LLMObs parent ID from request headers.")
+                return
+            try:
+                parent_id = int(_parent_id)
+            except ValueError:
+                error = "invalid_parent_id"
+                log.warning("Failed to parse LLMObs parent ID from request headers.")
+                return
+            parent_llmobs_trace_id = context._meta.get(PROPAGATED_LLMOBS_TRACE_ID_KEY)
+            if parent_llmobs_trace_id is None:
+                log.debug("Failed to extract LLMObs trace ID from request headers. Expected string, got None.")
+                llmobs_context = Context(trace_id=context.trace_id, span_id=parent_id)
+                llmobs_context._meta[PROPAGATED_LLMOBS_TRACE_ID_KEY] = str(context.trace_id)
+                cls._instance._llmobs_context_provider.activate(llmobs_context)
+                error = "missing_parent_llmobs_trace_id"
             llmobs_context = Context(trace_id=context.trace_id, span_id=parent_id)
-            llmobs_context._meta[PROPAGATED_LLMOBS_TRACE_ID_KEY] = str(context.trace_id)
+            llmobs_context._meta[PROPAGATED_LLMOBS_TRACE_ID_KEY] = str(parent_llmobs_trace_id)
             cls._instance._llmobs_context_provider.activate(llmobs_context)
-            return "missing_parent_llmobs_trace_id"
-        llmobs_context = Context(trace_id=context.trace_id, span_id=parent_id)
-        llmobs_context._meta[PROPAGATED_LLMOBS_TRACE_ID_KEY] = str(parent_llmobs_trace_id)
-        cls._instance._llmobs_context_provider.activate(llmobs_context)
-        return None
+        finally:
+            telemetry.record_activate_distributed_headers(error)
 
     @classmethod
-    def activate_distributed_headers(cls, request_headers: Dict[str, str]) -> None:
+    def activate_distributed_headers(cls, request_headers: Dict[str, str], _soft_fail: bool = False) -> None:
         """
         Activates distributed tracing headers for the current request.
@@ -1873,8 +1899,7 @@ def activate_distributed_headers(cls, request_headers: Dict[str, str]) -> None:
             return
         context = HTTPPropagator.extract(request_headers)
         cls._instance.tracer.context_provider.activate(context)
-        error = cls._instance._activate_llmobs_distributed_context(request_headers, context)
-        telemetry.record_activate_distributed_headers(error)
+        cls._instance._activate_llmobs_distributed_context(request_headers, context, _soft_fail=_soft_fail)
 
 
 # initialize the default llmobs instance
diff --git a/tests/llmobs/test_llmobs_service.py b/tests/llmobs/test_llmobs_service.py
index 5a0c233002b..8a60c2d4416 100644
--- a/tests/llmobs/test_llmobs_service.py
+++ b/tests/llmobs/test_llmobs_service.py
@@ -26,6 +26,7 @@
 from ddtrace.llmobs._constants import OUTPUT_MESSAGES
 from ddtrace.llmobs._constants import OUTPUT_VALUE
 from ddtrace.llmobs._constants import PROPAGATED_ML_APP_KEY
+from ddtrace.llmobs._constants import PROPAGATED_PARENT_ID_KEY
 from ddtrace.llmobs._constants import SESSION_ID
 from ddtrace.llmobs._constants import SPAN_KIND
 from ddtrace.llmobs._constants import SPAN_START_WHILE_DISABLED_WARNING
@@ -424,23 +425,26 @@ def test_embedding_span(llmobs, llmobs_events):
     )
 
 
-def test_annotate_no_active_span_logs_warning(llmobs, mock_llmobs_logs):
-    llmobs.annotate(metadata={"test": "test"})
-    mock_llmobs_logs.warning.assert_called_once_with("No span provided and no active LLMObs-generated span found.")
+def test_annotate_no_active_span_logs_warning(llmobs):
+    with pytest.raises(Exception) as excinfo:
+        llmobs.annotate(metadata={"test": "test"})
+    assert str(excinfo.value) == "No span provided and no active LLMObs-generated span found."
 
 
-def test_annotate_non_llm_span_logs_warning(llmobs, mock_llmobs_logs):
+def test_annotate_non_llm_span_logs_warning(llmobs):
     dummy_tracer = DummyTracer()
     with dummy_tracer.trace("root") as non_llmobs_span:
-        llmobs.annotate(span=non_llmobs_span, metadata={"test": "test"})
-        mock_llmobs_logs.warning.assert_called_once_with("Span must be an LLMObs-generated span.")
+        with pytest.raises(Exception) as excinfo:
+            llmobs.annotate(span=non_llmobs_span, metadata={"test": "test"})
+        assert str(excinfo.value) == "Span must be an LLMObs-generated span."
 
 
-def test_annotate_finished_span_does_nothing(llmobs, mock_llmobs_logs):
+def test_annotate_finished_span_does_nothing(llmobs):
     with llmobs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span:
         pass
-    llmobs.annotate(span=span, metadata={"test": "test"})
-    mock_llmobs_logs.warning.assert_called_once_with("Cannot annotate a finished span.")
+    with pytest.raises(Exception) as excinfo:
+        llmobs.annotate(span=span, metadata={"test": "test"})
+    assert str(excinfo.value) == "Cannot annotate a finished span."
 
 
 def test_annotate_metadata(llmobs):
@@ -462,12 +466,11 @@ def test_annotate_metadata_updates(llmobs):
     }
 
 
-def test_annotate_metadata_wrong_type_raises_warning(llmobs, mock_llmobs_logs):
+def test_annotate_metadata_wrong_type_raises_warning(llmobs):
     with llmobs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span:
-        llmobs.annotate(span=span, metadata="wrong_metadata")
-        assert span._get_ctx_item(METADATA) is None
-        mock_llmobs_logs.warning.assert_called_once_with("metadata must be a dictionary")
-        mock_llmobs_logs.reset_mock()
+        with pytest.raises(Exception) as excinfo:
+            llmobs.annotate(span=span, metadata="wrong_metadata")
+        assert str(excinfo.value) == "metadata must be a dictionary"
 
 
 def test_annotate_tag(llmobs):
@@ -483,13 +486,11 @@ def test_annotate_tag_can_set_session_id(llmobs):
     assert span._get_ctx_item(SESSION_ID) == "1234567890"
 
 
-def test_annotate_tag_wrong_type(llmobs, mock_llmobs_logs):
+def test_annotate_tag_wrong_type(llmobs):
     with llmobs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span:
-        llmobs.annotate(span=span, tags=12345)
-        assert span._get_ctx_item(TAGS) is None
-        mock_llmobs_logs.warning.assert_called_once_with(
-            "span tags must be a dictionary of string key - primitive value pairs."
-        )
+        with pytest.raises(Exception) as excinfo:
+            llmobs.annotate(span=span, tags=12345)
+        assert str(excinfo.value) == "span tags must be a dictionary of string key - primitive value pairs."
 
 
 def test_annotate_input_string(llmobs):
@@ -758,14 +759,11 @@ def test_annotate_metrics_updates(llmobs):
     assert span._get_ctx_item(METRICS) == {"input_tokens": 20, "output_tokens": 20, "total_tokens": 40}
 
 
-def test_annotate_metrics_wrong_type(llmobs, mock_llmobs_logs):
+def test_annotate_metrics_wrong_type(llmobs):
     with llmobs.llm(model_name="test_model") as llm_span:
-        llmobs.annotate(span=llm_span, metrics=12345)
-        assert llm_span._get_ctx_item(METRICS) is None
-        mock_llmobs_logs.warning.assert_called_once_with(
-            "metrics must be a dictionary of string key - numeric value pairs."
-        )
-        mock_llmobs_logs.reset_mock()
+        with pytest.raises(Exception) as excinfo:
+            llmobs.annotate(span=llm_span, metrics=12345)
+        assert str(excinfo.value) == "metrics must be a dictionary of string key - numeric value pairs."
 
 
 def test_annotate_prompt_dict(llmobs):
@@ -835,20 +833,21 @@ def test_annotate_prompt_typed_dict(llmobs):
     }
 
 
-def test_annotate_prompt_wrong_type(llmobs, mock_llmobs_logs):
+def test_annotate_prompt_wrong_type(llmobs):
     with llmobs.llm(model_name="test_model") as span:
-        llmobs.annotate(span=span, prompt="prompt")
-        assert span._get_ctx_item(INPUT_PROMPT) is None
-        mock_llmobs_logs.warning.assert_called_once_with(
-            "Failed to validate prompt with error:", "Prompt must be a dictionary, received str.", exc_info=True
+        with pytest.raises(Exception) as excinfo:
+            llmobs.annotate(span=span, prompt="prompt")
+        assert excinfo.value.args == (
+            "Failed to validate prompt with error:",
+            "Prompt must be a dictionary, received str.",
         )
-        mock_llmobs_logs.reset_mock()
-        llmobs.annotate(span=span, prompt={"template": 1})
-        mock_llmobs_logs.warning.assert_called_once_with(
-            "Failed to validate prompt with error:", "template: 1 must be a string, received int", exc_info=True
+        with pytest.raises(Exception) as excinfo:
+            llmobs.annotate(span=span, prompt={"template": 1})
+        assert excinfo.value.args == (
+            "Failed to validate prompt with error:",
+            "template: 1 must be a string, received int",
         )
-        mock_llmobs_logs.reset_mock()
 
 
 def test_span_error_sets_error(llmobs, llmobs_events):
@@ -916,15 +915,17 @@ def test_ml_app_override(llmobs, llmobs_events):
     assert llmobs_events[6] == _expected_llmobs_non_llm_span_event(span, "retrieval", tags={"ml_app": "test_app"})
 
 
-def test_export_span_specified_span_is_incorrect_type_raises_warning(llmobs, mock_llmobs_logs):
-    llmobs.export_span(span="asd")
-    mock_llmobs_logs.warning.assert_called_once_with("Failed to export span. Span must be a valid Span object.")
+def test_export_span_specified_span_is_incorrect_type_raises_warning(llmobs):
+    with pytest.raises(Exception) as excinfo:
+        llmobs.export_span(span="asd")
+    assert str(excinfo.value) == "Failed to export span. Span must be a valid Span object."
 
 
-def test_export_span_specified_span_is_not_llmobs_span_raises_warning(llmobs, mock_llmobs_logs):
+def test_export_span_specified_span_is_not_llmobs_span_raises_warning(llmobs):
     with DummyTracer().trace("non_llmobs_span") as span:
-        llmobs.export_span(span=span)
-        mock_llmobs_logs.warning.assert_called_once_with("Span must be an LLMObs-generated span.")
+        with pytest.raises(Exception) as excinfo:
+            llmobs.export_span(span=span)
+        assert str(excinfo.value) == "Span must be an LLMObs-generated span."
 
 
 def test_export_span_specified_span_returns_span_context(llmobs):
@@ -980,24 +981,17 @@ def test_inject_distributed_headers_llmobs_disabled_does_nothing(llmobs, mock_ll
     assert headers == {}
 
 
-def test_inject_distributed_headers_not_dict_logs_warning(llmobs, mock_llmobs_logs):
-    headers = llmobs.inject_distributed_headers("not a dictionary", span=None)
-    mock_llmobs_logs.warning.assert_called_once_with("request_headers must be a dictionary of string key-value pairs.")
-    assert headers == "not a dictionary"
-    mock_llmobs_logs.reset_mock()
-    headers = llmobs.inject_distributed_headers(123, span=None)
-    mock_llmobs_logs.warning.assert_called_once_with("request_headers must be a dictionary of string key-value pairs.")
-    assert headers == 123
-    mock_llmobs_logs.reset_mock()
-    headers = llmobs.inject_distributed_headers(None, span=None)
-    mock_llmobs_logs.warning.assert_called_once_with("request_headers must be a dictionary of string key-value pairs.")
-    assert headers is None
+@pytest.mark.parametrize("request_headers", ["not a dictionary", 123, None])
+def test_inject_distributed_headers_not_dict_logs_warning(llmobs, request_headers):
+    with pytest.raises(Exception) as excinfo:
+        llmobs.inject_distributed_headers(request_headers, span=None)
+    assert str(excinfo.value) == "request_headers must be a dictionary of string key-value pairs."
 
 
-def test_inject_distributed_headers_no_active_span_logs_warning(llmobs, mock_llmobs_logs):
-    headers = llmobs.inject_distributed_headers({}, span=None)
-    mock_llmobs_logs.warning.assert_called_once_with("No span provided and no currently active span found.")
-    assert headers == {}
+def test_inject_distributed_headers_no_active_span_logs_warning(llmobs):
+    with pytest.raises(Exception) as excinfo:
+        llmobs.inject_distributed_headers({}, span=None)
+    assert str(excinfo.value) == "No span provided and no currently active span found."
 
 
 def test_inject_distributed_headers_span_calls_httppropagator_inject(llmobs, mock_llmobs_logs):
@@ -1032,36 +1026,29 @@ def test_activate_distributed_headers_calls_httppropagator_extract(llmobs, mock_
         mock_extract.assert_called_once_with({})
 
 
-def test_activate_distributed_headers_no_trace_id_does_nothing(llmobs, mock_llmobs_logs):
-    with mock.patch("ddtrace.llmobs._llmobs.HTTPPropagator.extract") as mock_extract:
-        mock_extract.return_value = Context(span_id=123)
+def test_activate_distributed_headers_no_trace_id_raises(llmobs):
+    with pytest.raises(Exception) as excinfo:
         llmobs.activate_distributed_headers({})
-        assert mock_extract.call_count == 1
-        mock_llmobs_logs.warning.assert_called_once_with("Failed to extract trace/span ID from request headers.")
+    assert str(excinfo.value) == "Failed to extract trace/span ID from request headers."
 
 
-def test_activate_distributed_headers_no_span_id_does_nothing(llmobs, mock_llmobs_logs):
-    with mock.patch("ddtrace.llmobs._llmobs.HTTPPropagator.extract") as mock_extract:
-        mock_extract.return_value = Context(trace_id=123)
+def test_activate_distributed_headers_no_span_id_raises(llmobs):
+    with pytest.raises(Exception) as excinfo:
         llmobs.activate_distributed_headers({})
-        assert mock_extract.call_count == 1
-        mock_llmobs_logs.warning.assert_called_once_with("Failed to extract trace/span ID from request headers.")
+    assert str(excinfo.value) == "Failed to extract trace/span ID from request headers."
 
 
 def test_activate_distributed_headers_no_llmobs_parent_id_does_nothing(llmobs, mock_llmobs_logs):
     with mock.patch("ddtrace.llmobs._llmobs.HTTPPropagator.extract") as mock_extract:
         dummy_context = Context(trace_id=123, span_id=456)
         mock_extract.return_value = dummy_context
-        with mock.patch("ddtrace.llmobs.LLMObs._instance.tracer.context_provider.activate") as mock_activate:
-            llmobs.activate_distributed_headers({})
-            assert mock_extract.call_count == 1
-            mock_llmobs_logs.debug.assert_called_once_with("Failed to extract LLMObs parent ID from request headers.")
-            mock_activate.assert_called_once_with(dummy_context)
+        llmobs.activate_distributed_headers({})
+        mock_llmobs_logs.debug.assert_called_once_with("Failed to extract LLMObs parent ID from request headers.")
 
 
-def test_activate_distributed_headers_activates_context(llmobs, mock_llmobs_logs):
+def test_activate_distributed_headers_activates_context(llmobs):
     with mock.patch("ddtrace.llmobs._llmobs.HTTPPropagator.extract") as mock_extract:
-        dummy_context = Context(trace_id=123, span_id=456)
+        dummy_context = Context(trace_id=123, span_id=456, meta={PROPAGATED_PARENT_ID_KEY: "123"})
         mock_extract.return_value = dummy_context
         with mock.patch("ddtrace.llmobs.LLMObs._instance.tracer.context_provider.activate") as mock_activate:
             llmobs.activate_distributed_headers({})
@@ -1537,15 +1524,16 @@ def test_service_enable_does_not_start_evaluator_runner():
     llmobs_service.disable()
 
 
-def test_submit_evaluation_no_ml_app_raises_warning(llmobs, mock_llmobs_logs):
+def test_submit_evaluation_no_ml_app_raises_warning(llmobs):
     with override_global_config(dict(_llmobs_ml_app="")):
-        llmobs.submit_evaluation(
-            span={"span_id": "123", "trace_id": "456"},
-            label="toxicity",
-            metric_type="categorical",
-            value="high",
-        )
-        mock_llmobs_logs.warning.assert_called_once_with(
+        with pytest.raises(Exception) as excinfo:
+            llmobs.submit_evaluation(
+                span={"span_id": "123", "trace_id": "456"},
+                label="toxicity",
+                metric_type="categorical",
+                value="high",
+            )
+        assert str(excinfo.value) == (
            "ML App name is required for sending evaluation metrics. Evaluation metric data will not be sent. "
             "Ensure this configuration is set before running your application."
         )
@@ -1653,46 +1641,32 @@ def test_submit_evaluation_incorrect_score_value_type_raises_error(llmobs, mock_
 
 
 def test_submit_evaluation_invalid_tags_raises_warning(llmobs, mock_llmobs_logs):
-    llmobs.submit_evaluation(
-        span={"span_id": "123", "trace_id": "456"},
-        label="toxicity",
-        metric_type="categorical",
-        value="high",
-        tags=["invalid"],
-    )
-    mock_llmobs_logs.warning.assert_called_once_with("tags must be a dictionary of string key-value pairs.")
+    with pytest.raises(Exception) as excinfo:
+        llmobs.submit_evaluation(
+            span={"span_id": "123", "trace_id": "456"},
+            label="toxicity",
+            metric_type="categorical",
+            value="high",
+            tags=["invalid"],
+        )
+    assert str(excinfo.value) == "tags must be a dictionary of string key-value pairs."
 
 
 @pytest.mark.parametrize(
     "ddtrace_global_config",
     [dict(_llmobs_ml_app="test_app_name")],
 )
-def test_submit_evaluation_non_string_tags_raises_warning_but_still_submits(
-    llmobs, mock_llmobs_logs, mock_llmobs_eval_metric_writer
-):
-    llmobs.submit_evaluation(
-        span={"span_id": "123", "trace_id": "456"},
-        label="toxicity",
-        metric_type="categorical",
-        value="high",
-        tags={1: 2, "foo": "bar"},
-        ml_app="dummy",
-    )
-    mock_llmobs_logs.warning.assert_called_once_with(
-        "Failed to parse tags. Tags for evaluation metrics must be strings."
-    )
-    mock_llmobs_logs.reset_mock()
-    mock_llmobs_eval_metric_writer.enqueue.assert_called_with(
-        _expected_llmobs_eval_metric_event(
-            ml_app="dummy",
-            span_id="123",
-            trace_id="456",
+def test_submit_evaluation_non_string_tags_raises(llmobs):  # TODO(sabrenner): check if we're ok changing this behavior
+    with pytest.raises(Exception) as excinfo:
+        llmobs.submit_evaluation(
+            span={"span_id": "123", "trace_id": "456"},
             label="toxicity",
             metric_type="categorical",
-            categorical_value="high",
-            tags=["ddtrace.version:{}".format(ddtrace.__version__), "ml_app:dummy", "foo:bar"],
+            value="high",
+            tags={1: 2, "foo": "bar"},
+            ml_app="dummy",
         )
-    )
+    assert str(excinfo.value) == "Failed to parse tags. Tags for evaluation metrics must be strings."
 
 
 @pytest.mark.parametrize(
@@ -1834,40 +1808,18 @@ def test_submit_evaluation_metric_with_metadata_enqueues_metric(llmobs, mock_llm
             metadata={"foo": ["bar", "baz"]},
         )
     )
-    mock_llmobs_eval_metric_writer.reset()
-    llmobs.submit_evaluation(
-        span={"span_id": "123", "trace_id": "456"},
-        label="toxicity",
-        metric_type="categorical",
-        value="high",
-        tags={"foo": "bar", "bee": "baz", "ml_app": "ml_app_override"},
-        ml_app="ml_app_override",
-        metadata="invalid",
-    )
-    mock_llmobs_eval_metric_writer.enqueue.assert_called_with(
-        _expected_llmobs_eval_metric_event(
-            ml_app="ml_app_override",
-            span_id="123",
-            trace_id="456",
+
+
+def test_submit_evaluation_invalid_assessment_raises(llmobs):
+    with pytest.raises(Exception) as excinfo:
+        llmobs.submit_evaluation(
+            span={"span_id": "123", "trace_id": "456"},
             label="toxicity",
             metric_type="categorical",
-            categorical_value="high",
-            tags=["ddtrace.version:{}".format(ddtrace.__version__), "ml_app:ml_app_override", "foo:bar", "bee:baz"],
+            value="high",
+            assessment=True,
         )
-    )
-
-
-def test_submit_evaluation_invalid_assessment_raises_warning(llmobs, mock_llmobs_logs):
-    llmobs.submit_evaluation(
-        span={"span_id": "123", "trace_id": "456"},
-        label="toxicity",
-        metric_type="categorical",
-        value="high",
-        assessment=True,
-    )
-    mock_llmobs_logs.warning.assert_called_once_with(
-        "Failed to parse assessment. assessment must be either 'pass' or 'fail'."
-    )
+    assert str(excinfo.value) == "Failed to parse assessment. assessment must be either 'pass' or 'fail'."
 
 
 def test_submit_evaluation_enqueues_writer_with_assessment(llmobs, mock_llmobs_eval_metric_writer):
@@ -1902,7 +1854,7 @@ def test_submit_evaluation_enqueues_writer_with_assessment(llmobs, mock_llmobs_e
         value="high",
         tags={"foo": "bar", "bee": "baz", "ml_app": "ml_app_override"},
         ml_app="ml_app_override",
-        metadata="invalid",
+        metadata={"foo": ["bar", "baz"]},
         assessment="fail",
     )
     mock_llmobs_eval_metric_writer.enqueue.assert_called_with(
@@ -1914,20 +1866,22 @@ def test_submit_evaluation_enqueues_writer_with_assessment(llmobs, mock_llmobs_e
             metric_type="categorical",
             categorical_value="high",
             tags=["ddtrace.version:{}".format(ddtrace.__version__), "ml_app:ml_app_override", "foo:bar", "bee:baz"],
+            metadata={"foo": ["bar", "baz"]},
             assessment="fail",
         )
     )
 
 
 def test_submit_evaluation_invalid_reasoning_raises_warning(llmobs, mock_llmobs_logs):
-    llmobs.submit_evaluation(
-        span={"span_id": "123", "trace_id": "456"},
-        label="toxicity",
-        metric_type="categorical",
-        value="high",
-        reasoning=123,
-    )
-    mock_llmobs_logs.warning.assert_called_once_with("Failed to parse reasoning. reasoning must be a string.")
+    with pytest.raises(Exception) as excinfo:
+        llmobs.submit_evaluation(
+            span={"span_id": "123", "trace_id": "456"},
+            label="toxicity",
+            metric_type="categorical",
+            value="high",
+            reasoning=123,
+        )
+    assert str(excinfo.value) == "Failed to parse reasoning. reasoning must be a string."
 
 
 def test_submit_evaluation_for_enqueues_writer_with_reasoning(llmobs, mock_llmobs_eval_metric_writer):
@@ -1954,29 +1908,6 @@ def test_submit_evaluation_for_enqueues_writer_with_reasoning(llmob
             reasoning="the content of the message involved profanity",
         )
     )
-    mock_llmobs_eval_metric_writer.reset()
-    llmobs.submit_evaluation_for(
-        span={"span_id": "123", "trace_id": "456"},
-        label="toxicity",
-        metric_type="categorical",
-        value="low",
-        tags={"foo": "bar", "bee": "baz", "ml_app": "ml_app_override"},
-        ml_app="ml_app_override",
-        metadata="invalid",
-        reasoning="the content of the message did not involve profanity or hate speech or negativity",
-    )
-    mock_llmobs_eval_metric_writer.enqueue.assert_called_with(
-        _expected_llmobs_eval_metric_event(
-            ml_app="ml_app_override",
-            span_id="123",
-            trace_id="456",
-            label="toxicity",
-            metric_type="categorical",
-            categorical_value="low",
-            tags=["ddtrace.version:{}".format(ddtrace.__version__), "ml_app:ml_app_override", "foo:bar", "bee:baz"],
-            reasoning="the content of the message did not involve profanity or hate speech or negativity",
-        )
-    )
 
 
 def test_llmobs_parenting_with_root_apm_span(llmobs, tracer, llmobs_events):