NOISSUE - Update to SMQ v0.19.1 (#222)

SammyOina · web-flow · commit e31d9cc3c10a · 2026-03-12T13:38:58.000+01:00
* feat: update environment variables for local development and enhance proxy request handling

Signed-off-by: Sammy Oina <sammyoina@gmail.com>

* chore: update release tag for Docker image and fix supermq dependency version

Signed-off-by: Sammy Oina <sammyoina@gmail.com>

* feat: update environment variables and target URLs for internal services

Signed-off-by: Sammy Oina <sammyoina@gmail.com>

---------

Signed-off-by: Sammy Oina <sammyoina@gmail.com>
diff --git a/docker/.env b/docker/.env
@@ -187,7 +187,7 @@ SMQ_EMAIL_FROM_ADDRESS=__SMQ_EMAIL_FROM_ADDRESS__
 SMQ_EMAIL_FROM_NAME=Cube AI
 
 # Docker image tag
-SMQ_RELEASE_TAG=latest
+SMQ_RELEASE_TAG=v0.19.1
 
 # Proxy
 UV_CUBE_PROXY_LOG_LEVEL=debug
@@ -360,3 +360,7 @@ MG_UI_RELEASE_TAG=latest
 MG_BACKEND_RELEASE_TAG=latest
 MG_BACKEND_URL=http://magistrala-backend:9097
 MG_UI_IMAGE_URL=http://magistrala-backend:9097  
+
+# LLM response timeouts
+UV_CUBE_PROXY_SERVER_WRITE_TIMEOUT=120s
+UV_CUBE_PROXY_SERVER_READ_TIMEOUT=120s
diff --git a/docker/cube-compose.yaml b/docker/cube-compose.yaml
@@ -222,6 +222,8 @@ services:
       UV_CUBE_PROXY_DB_PASS: ${UV_CUBE_PROXY_DB_PASS}
       UV_CUBE_PROXY_DB_NAME: ${UV_CUBE_PROXY_DB_NAME}
       SMQ_ALLOW_UNVERIFIED_USER: ${SMQ_ALLOW_UNVERIFIED_USER}
+      UV_CUBE_PROXY_SERVER_WRITE_TIMEOUT: ${UV_CUBE_PROXY_SERVER_WRITE_TIMEOUT}
+      UV_CUBE_PROXY_SERVER_READ_TIMEOUT: ${UV_CUBE_PROXY_SERVER_READ_TIMEOUT}
       UV_CUBE_PROXY_DB_SSL_MODE: ${UV_CUBE_PROXY_DB_SSL_MODE}
       UV_CUBE_PROXY_DB_SSL_CERT: ${UV_CUBE_PROXY_DB_SSL_CERT}
       UV_CUBE_PROXY_DB_SSL_KEY: ${UV_CUBE_PROXY_DB_SSL_KEY}
diff --git a/go.mod b/go.mod
@@ -5,7 +5,7 @@ go 1.26.0
 require (
 	github.com/absmach/callhome v0.18.2
 	github.com/absmach/certs v0.18.2
-	github.com/absmach/supermq v0.19.1-0.20260311095911-28ae84286e16
+	github.com/absmach/supermq v0.19.1
 	github.com/caarlos0/env/v11 v11.4.0
 	github.com/go-chi/chi/v5 v5.2.5
 	github.com/go-kit/kit v0.13.0
diff --git a/go.sum b/go.sum
@@ -16,8 +16,8 @@ github.com/absmach/callhome v0.18.2 h1:dmopRHm2qTheHN1hdUKRRYpKwRrj7X9d8AWCFrb+K
 github.com/absmach/callhome v0.18.2/go.mod h1:LEXKhES9JJtj3tBgTZv7VPNjOi5ukJQB0mFic0QP60Q=
 github.com/absmach/certs v0.18.2 h1:s6KKL3/KfDZ6z0IxvNCksIOUwRnEgQyCpeAonuR15No=
 github.com/absmach/certs v0.18.2/go.mod h1:scqVZsmW2xPScnpMTtE70oN6cn0LLjFcJVPi4JKZ4+E=
-github.com/absmach/supermq v0.19.1-0.20260311095911-28ae84286e16 h1:C8ekYx3p/uFlXa6WA6Vg1IRZVloGOOi0OqT3QkBQIMI=
-github.com/absmach/supermq v0.19.1-0.20260311095911-28ae84286e16/go.mod h1:UCC6/UIRhO70inBIzlwC1Cm/wCyQj+yuP4dhMZSjXIc=
+github.com/absmach/supermq v0.19.1 h1:uLrf1fXpn0W6BkSSaa+d1Kw0KXygSfNn+b2EqFpCiMA=
+github.com/absmach/supermq v0.19.1/go.mod h1:UCC6/UIRhO70inBIzlwC1Cm/wCyQj+yuP4dhMZSjXIc=
 github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
 github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
 github.com/caarlos0/env/v11 v11.4.0 h1:Kcb6t5kIIr4XkoQC9AF2j+8E1Jsrl3Wz/hhm1LtoGAc=
diff --git a/guardrails/rails/general.co b/guardrails/rails/general.co
@@ -4,19 +4,19 @@
 import core
 
 # ---------------------------------------------------------------------------
-# Guard-only NeMo configuration.
+# NeMo Guardrails Configuration
 #
-# NeMo is used EXCLUSIVELY for input/output safety guardrails (pattern
-# matching).  No LLM calls are made inside NeMo.
+# Guards run as high-priority background flows.  When a guard matches, its
+# refusal message is returned immediately.
 #
-# Response generation is handled by the Python router:
-#   - Greetings / goodbye / capabilities → Python canned responses
-#   - All other messages → direct LLM call (bypassing NeMo flow engine)
+# For messages that pass all guards, the low-priority `passthrough response`
+# flow fires and returns a "<<GUARDRAILS_PASS>>" marker so the Python
+# router knows NeMo approved the message.  This avoids the ~5 s delay
+# previously caused by `wait indefinitely` leaving NeMo's event loop
+# with no response to emit.
 #
-# This avoids NeMo's Colang 2.x state-machine issues with smaller models
-# (llama3.2:3b) where intent classification returns names without the
-# required "user " prefix, causing `continuation on unhandled user intent`
-# to never fire.
+# Response generation for approved messages is handled by the Python router
+# which calls the LLM directly after receiving the pass marker.
 # ---------------------------------------------------------------------------
 
 flow main
@@ -27,5 +27,21 @@ flow main
   # Output-side guard — catches unsafe bot output patterns.
   activate output guard
 
+  # Fast catch-all for approved messages (see the header comment above).
+  activate passthrough response
+
   # Keep main alive so the activated flows persist across turns.
   wait indefinitely
+
+
+# ---------------------------------------------------------------------------
+# Low-priority catch-all: fires for any user message that passed all guards.
+# Produces a marker so NeMo returns in <1 s instead of ~5 s.
+# Priority 0.1 ensures guards (0.9 / 0.8) always take precedence.
+# ---------------------------------------------------------------------------
+
+@loop("passthrough_loop")
+flow passthrough response
+  priority 0.1
+  user said something
+  bot say "<<GUARDRAILS_PASS>>"
diff --git a/guardrails/src/adapters/llm/cube_llm.py b/guardrails/src/adapters/llm/cube_llm.py
@@ -101,6 +101,27 @@ def _get_model_from_context(self) -> Optional[str]:
             logger.debug("CubeLLM: generation_options_var not set for model")
             return None
 
+    def _get_base_url_from_context(self) -> Optional[str]:
+        """Read per-request base_url from NeMo context.
+
+        The proxy forwards the caller's domain ID to guardrails via the
+        ``X-Domain-ID`` header.  Guardrails then includes it in
+        ``llm_params["base_url"]`` so that LLM calls are routed back
+        through the proxy with the correct ``/{domainID}/v1/…`` path
+        prefix, which is needed for domain-level authentication.
+        """
+        try:
+            gen_options = generation_options_var.get()
+            if gen_options and gen_options.llm_params:
+                base_url = gen_options.llm_params.get("base_url")
+                if base_url:
+                    logger.debug(f"CubeLLM: found base_url in context: {base_url}")
+                    return str(base_url)
+            return None
+        except LookupError:
+            logger.debug("CubeLLM: generation_options_var not set for base_url")
+            return None
+
     def _merge_headers(self) -> Dict[str, str]:
         base_headers = self._config_headers or {}
         request_headers = self._get_headers_from_context()
@@ -153,7 +174,14 @@ async def _agenerate(
         model = context_model or self.model_name
         logger.debug(f"CubeLLM._agenerate: model='{model}', from_context={context_model is not None}")
 
-        base_url = self._normalized_base_url
+        # Support per-request base_url from context (needed for domain-prefixed proxy URLs)
+        context_base_url = self._get_base_url_from_context()
+        if context_base_url:
+            if not context_base_url.endswith("/v1"):
+                context_base_url = f"{context_base_url.rstrip('/')}/v1"
+            base_url = context_base_url
+        else:
+            base_url = self._normalized_base_url
 
         try:
             # Create a temporary client to inject per-request headers
@@ -200,7 +228,14 @@ def _generate(
         model = context_model or self.model_name
         logger.debug(f"CubeLLM._generate: model='{model}', from_context={context_model is not None}")
 
-        base_url = self._normalized_base_url
+        # Support per-request base_url from context (needed for domain-prefixed proxy URLs)
+        context_base_url = self._get_base_url_from_context()
+        if context_base_url:
+            if not context_base_url.endswith("/v1"):
+                context_base_url = f"{context_base_url.rstrip('/')}/v1"
+            base_url = context_base_url
+        else:
+            base_url = self._normalized_base_url
 
         try:
             temp_client = ChatOpenAI(
diff --git a/guardrails/src/adapters/runtime/nemo_runtime.py b/guardrails/src/adapters/runtime/nemo_runtime.py
@@ -114,6 +114,14 @@ async def swap(self, materialized: MaterializedGuardrail) -> None:
 
                 new_rails = LLMRails(rails_config)
 
+                # Reduce max_events from the default 500.
+                # Guards and the passthrough flow fire within ~30 events;
+                # the remaining hundreds are cascading UnhandledEvent noise
+                # that adds seconds of latency with no functional benefit.
+                if hasattr(new_rails, "runtime") and hasattr(new_rails.runtime, "max_events"):
+                    new_rails.runtime.max_events = 100
+                    logger.info("Set NeMo runtime max_events to 100 (was 500)")
+
                 # Atomic swap
                 self._rails = new_rails
                 self._revision = materialized.revision
diff --git a/guardrails/src/drivers/rest/routers/guardrails.py b/guardrails/src/drivers/rest/routers/guardrails.py
diff --git a/proxy/api/transport.go b/proxy/api/transport.go