Skip to content

Commit ba294e0

Browse files
authored
Add processor_runs objref, delete unused model, add logs for pipeline (#265)
1 parent: db02e56 · commit: ba294e0

File tree

8 files changed

+109
-68
lines changed

8 files changed

+109
-68
lines changed

llmstack/assets/utils.py

Lines changed: 13 additions & 5 deletions
Original file line number · Diff line number · Diff line change
@@ -3,9 +3,9 @@
33
logger = logging.getLogger(__name__)
44

55

6-
def get_asset_by_objref(objref, request_user, request_session):
6+
def get_asset_by_objref_internal(objref):
77
"""
8-
Get asset by objref if one exists and is accessible by the user.
8+
Get asset by objref if one exists.
99
"""
1010
from llmstack.apps.models import AppDataAssets, AppSessionFiles
1111
from llmstack.data.models import DataSourceEntryFiles
@@ -29,10 +29,18 @@ def get_asset_by_objref(objref, request_user, request_session):
2929
return None
3030

3131
asset = model_cls.objects.get(uuid=uuid)
32-
33-
if not asset or not asset.is_accessible(request_user, request_session):
34-
return None
3532
except Exception as e:
3633
logger.error(f"Error retrieving asset: {e}")
3734

3835
return asset
36+
37+
38+
def get_asset_by_objref(objref, request_user, request_session):
39+
"""
40+
Get asset by objref if one exists and is accessible by the user.
41+
"""
42+
asset = get_asset_by_objref_internal(objref)
43+
if not asset or not asset.is_accessible(request_user, request_session):
44+
return None
45+
46+
return asset

llmstack/data/pipeline.py

Lines changed: 22 additions & 9 deletions
Original file line number · Diff line number · Diff line change
@@ -22,11 +22,13 @@ def __init__(self, datasource: DataSource):
2222
self.datasource = datasource
2323
self._source_cls = self.datasource.pipeline_obj.source_cls
2424
self._destination_cls = self.datasource.pipeline_obj.destination_cls
25+
logger.debug("Initializing DataIngestionPipeline")
2526

2627
self._destination = None
2728
self._transformations = self.datasource.pipeline_obj.transformation_objs
2829
embedding_cls = self.datasource.pipeline_obj.embedding_cls
2930
if embedding_cls:
31+
logger.debug("Initializing DataIngestionPipeline Transformation")
3032
embedding_additional_kwargs = {
3133
**self.datasource.pipeline_obj.embedding.data.get("additional_kwargs", {}),
3234
**{"datasource": datasource},
@@ -39,29 +41,29 @@ def __init__(self, datasource: DataSource):
3941
}
4042
)
4143
)
44+
logger.debug("Finished Initializing DataIngestionPipeline Transformation")
4245

4346
if self._destination_cls:
47+
logger.debug("Initializing DataIngestionPipeline Destination")
4448
self._destination = self._destination_cls(**self.datasource.pipeline_obj.destination_data)
4549
self._destination.initialize_client(datasource=self.datasource, create_collection=True)
50+
logger.debug("Finished Initializing DataIngestionPipeline Destination")
4651

4752
def process(self, document: DataDocument) -> DataDocument:
53+
logger.debug(f"Processing document: {document.name}")
4854
document = self._source_cls.process_document(document)
49-
if self.datasource.pipeline_obj.embedding:
50-
embedding_data = self.datasource.pipeline_obj.embedding.data
51-
embedding_data["additional_kwargs"] = {
52-
**embedding_data.get("additional_kwargs", {}),
53-
**{"datasource": self.datasource},
54-
}
55-
embedding_transformer = self.datasource.pipeline_obj.embedding_cls(**embedding_data)
56-
self._transformations.append(embedding_transformer)
57-
55+
logger.debug(f"Creating IngestionPipeline for document: {document.name}")
5856
ingestion_pipeline = IngestionPipeline(transformations=self._transformations)
5957
ldoc = LlamaDocumentShim(**document.model_dump())
6058
ldoc.metadata = {**ldoc.metadata, **document.metadata}
59+
logger.debug(f"Running IngestionPipeline for document: {document.name}")
6160
document.nodes = ingestion_pipeline.run(documents=[ldoc])
61+
logger.debug(f"Finished running IngestionPipeline for document: {document.name}")
6262
document.node_ids = list(map(lambda x: x.id_, document.nodes))
6363
if self._destination:
64+
logger.debug(f"Adding document: {document.name} to destination")
6465
self._destination.add(document=document)
66+
logger.debug(f"Finished adding document: {document.name} to destination")
6567

6668
return document
6769

@@ -83,39 +85,50 @@ def __init__(self, datasource: DataSource):
8385
self._destination_cls = self.datasource.pipeline_obj.destination_cls
8486
self._destination = None
8587
self._embedding_generator = None
88+
logger.debug("Initializing DataQueryPipeline")
8689

8790
if self._destination_cls:
91+
logger.debug("Initializing DataQueryPipeline Destination")
8892
self._destination = self._destination_cls(**self.datasource.pipeline_obj.destination_data)
8993
self._destination.initialize_client(datasource=self.datasource, create_collection=False)
94+
logger.debug("Finished Initializing DataQueryPipeline Destination")
9095

9196
if self.datasource.pipeline_obj.embedding:
97+
logger.debug("Initializing DataQueryPipeline Embedding")
9298
embedding_data = self.datasource.pipeline_obj.embedding.data
9399
embedding_data["additional_kwargs"] = {
94100
**embedding_data.get("additional_kwargs", {}),
95101
**{"datasource": self.datasource},
96102
}
97103
self._embedding_generator = self.datasource.pipeline_obj.embedding_cls(**embedding_data)
104+
logger.debug("Finished Initializing DataQueryPipeline Embedding")
98105

99106
def search(self, query: str, use_hybrid_search=True, **kwargs) -> List[dict]:
100107
content_key = self.datasource.destination_text_content_key
101108
query_embedding = None
102109

110+
logger.debug(f"Initializing Search for query: {query}")
111+
103112
if kwargs.get("search_filters", None):
104113
raise NotImplementedError("Search filters are not supported for this data source.")
105114

106115
documents = []
107116

108117
if self._embedding_generator:
118+
logger.debug("Generating embedding for query")
109119
query_embedding = self._embedding_generator.get_embedding(query)
120+
logger.debug("Finished generating embedding for query")
110121

111122
if self._destination:
123+
logger.debug(f"Searching for query: {query} in destination")
112124
query_result = self._destination.search(
113125
query=query,
114126
use_hybrid_search=use_hybrid_search,
115127
query_embedding=query_embedding,
116128
datasource_uuid=str(self.datasource.uuid),
117129
**kwargs,
118130
)
131+
logger.debug(f"Received results for query: {query} from destination")
119132
documents = list(
120133
map(
121134
lambda x: Document(page_content_key=content_key, page_content=x.text, metadata=x.metadata),

llmstack/data/sources/website/url.py

Lines changed: 5 additions & 3 deletions
Original file line number · Diff line number · Diff line change
@@ -111,9 +111,11 @@ def process_document(cls, document: DataDocument) -> DataDocument:
111111
connection_context = (
112112
get_connection_context(connection_id, document.metadata["datasource_uuid"]) if connection_id else None
113113
)
114-
html_page = get_page_html(
115-
document.request_data.get("url"), connection=connection_id, storage_state=connection_context
116-
)
114+
url = document.name
115+
if document.request_data.get("url"):
116+
url = document.request_data.get("url")
117+
118+
html_page = get_page_html(url, connection=connection_id, storage_state=connection_context)
117119
page_text = extract_text(html_page)
118120

119121
text_data_uri = (

llmstack/events/consumers/app_run_finished.py

Lines changed: 1 addition & 2 deletions
Original file line number · Diff line number · Diff line change
@@ -217,7 +217,6 @@ def persist_app_run_history(event_data: AppRunFinishedEventData):
217217
response_content_type=event_data.response_content_type,
218218
response_headers=event_data.response_headers,
219219
response_time=event_data.response_time,
220-
processor_runs=event_data.processor_runs,
221220
platform_data=event_data.platform_data,
222221
)
223-
run_entry.save()
222+
run_entry.save(processor_runs=event_data.processor_runs)

llmstack/processors/admin.py

Lines changed: 0 additions & 2 deletions
Original file line number · Diff line number · Diff line change
@@ -4,7 +4,6 @@
44
ApiBackend,
55
ApiProvider,
66
Endpoint,
7-
EndpointInvocationCount,
87
Feedback,
98
Request,
109
Response,
@@ -16,7 +15,6 @@
1615
admin.site.register(ApiProvider)
1716
admin.site.register(ApiBackend)
1817
admin.site.register(Endpoint)
19-
admin.site.register(EndpointInvocationCount)
2018
admin.site.register(VersionedEndpoint)
2119
admin.site.register(Feedback)
2220
admin.site.register(Request)
Lines changed: 21 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -0,0 +1,21 @@
1+
# Generated by Django 4.2.14 on 2024-08-07 21:57
2+
3+
from django.db import migrations, models
4+
5+
6+
class Migration(migrations.Migration):
7+
8+
dependencies = [
9+
('apiabstractor', '0007_runentry_apiabstract_request_fb04e3_idx_and_more'),
10+
]
11+
12+
operations = [
13+
migrations.AddField(
14+
model_name='runentry',
15+
name='processor_runs_objref',
16+
field=models.CharField(blank=True, default=None, help_text='Processor runs objref', null=True),
17+
),
18+
migrations.DeleteModel(
19+
name='EndpointInvocationCount',
20+
),
21+
]

llmstack/processors/models.py

Lines changed: 34 additions & 47 deletions
Original file line number · Diff line number · Diff line change
@@ -1,3 +1,4 @@
1+
import json
12
import logging
23
import uuid
34

@@ -7,6 +8,7 @@
78
from django.db.models.signals import pre_save
89
from django.dispatch import receiver
910

11+
from llmstack.assets.utils import get_asset_by_objref_internal
1012
from llmstack.common.utils.db_models import ArrayField
1113

1214
logger = logging.getLogger(__name__)
@@ -416,6 +418,12 @@ class RunEntry(models.Model):
416418
help_text="Array of processor data for each endpoint including input and output data",
417419
)
418420
)
421+
processor_runs_objref = models.CharField(
422+
default=None,
423+
blank=True,
424+
null=True,
425+
help_text="Processor runs objref",
426+
)
419427
platform_data = models.JSONField(
420428
default=dict,
421429
blank=True,
@@ -442,66 +450,45 @@ def clean_dict(self, data):
442450
return data.replace("\u0000", "")
443451
return data
444452

445-
def clean_processor_runs(self):
446-
if self.processor_runs:
447-
self.processor_runs = [self.clean_dict(item) for item in self.processor_runs]
453+
def clean_processor_runs(self, processor_runs=[]):
454+
return [self.clean_dict(item) for item in processor_runs]
448455

449456
def save(self, *args, **kwargs):
450457
# Clean the processor_runs field
451-
self.clean_processor_runs()
458+
processor_runs = kwargs.pop("processor_runs", [])
459+
processor_runs_objref = self.create_processor_runs_objref(processor_runs)
460+
self.processor_runs = []
461+
self.processor_runs_objref = processor_runs_objref
452462
super(RunEntry, self).save(*args, **kwargs)
453463

454464
@property
455465
def is_store_request(self):
456466
return self.app_store_uuid is not None
457467

458-
@staticmethod
459-
def from_pinot_dict(row):
460-
owner = User.objects.get(id=row["owner_id"])
461-
462-
return RunEntry(
463-
request_uuid=row["request_uuid"],
464-
app_uuid=row["app_uuid"],
465-
endpoint_uuid=row["endpoint_uuid"],
466-
owner=owner,
467-
session_key=row["session_key"],
468-
request_user_email=row["request_user_email"],
469-
request_ip=row["request_ip"],
470-
request_location=row["request_location"],
471-
request_user_agent=row["request_user_agent"],
472-
request_content_type=row["request_content_type"],
473-
request_body=row["request_body"],
474-
response_status=row["response_status"],
475-
response_body=row["response_body"],
476-
response_content_type=row["response_content_type"],
477-
response_headers=row["response_headers"],
478-
response_time=row["response_time"],
479-
processor_runs=row["processor_runs"],
480-
)
468+
def create_processor_runs_objref(self, processor_runs=[]):
469+
import base64
470+
import json
481471

472+
from llmstack.apps.models import AppSessionFiles
482473

483-
class EndpointInvocationCount(models.Model):
484-
"""
485-
Model to track the usage of endpoints by users
486-
"""
474+
processor_runs = self.clean_processor_runs(processor_runs)
475+
processor_runs = {"processor_runs": processor_runs}
487476

488-
user = models.ForeignKey(
489-
User,
490-
on_delete=models.DO_NOTHING,
491-
help_text="User this count is for",
492-
)
493-
month = models.CharField(
494-
max_length=5,
495-
help_text="Month for the count as MM-YY",
496-
default="",
497-
)
498-
count = models.IntegerField(
499-
help_text="Count for the month",
500-
default=0,
501-
)
477+
request_uuid = str(self.request_uuid)
478+
processor_runs_datauri = f"data:application/json;name={request_uuid}_processor_runs.json;base64,{base64.b64encode(json.dumps(processor_runs).encode()).decode()}"
502479

503-
def __str__(self):
504-
return self.user.__str__() + ":" + self.month
480+
session_id = self.session_key
481+
processor_runs_objrefs = AppSessionFiles.create_from_data_uri(
482+
data_uri=processor_runs_datauri,
483+
ref_id=session_id,
484+
metadata={"session_id": session_id, "request_uuid": request_uuid},
485+
)
486+
return processor_runs_objrefs.objref
487+
488+
def get_processor_runs_from_objref(self):
489+
file_asset = get_asset_by_objref_internal(self.processor_runs_objref)
490+
content = file_asset.file.read().decode("utf-8")
491+
return json.loads(content).get("processor_runs", [])
505492

506493

507494
class Feedback(models.Model):

llmstack/processors/serializers.py

Lines changed: 13 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -161,6 +161,7 @@ class Meta:
161161

162162
class HistorySerializer(serializers.ModelSerializer):
163163
app_detail = serializers.SerializerMethodField()
164+
processor_runs = serializers.SerializerMethodField()
164165

165166
def to_representation(self, instance):
166167
representation = super().to_representation(instance)
@@ -205,6 +206,18 @@ def get_app_store_app(uuid):
205206
pass
206207
return {"name": "Deleted App", "path": "/"}
207208

209+
def get_processor_runs(self, obj):
210+
if obj.processor_runs_objref:
211+
try:
212+
return obj.get_processor_runs_from_objref()
213+
except Exception:
214+
pass
215+
216+
if obj.processor_runs:
217+
return obj.processor_runs
218+
219+
return []
220+
208221
class Meta:
209222
model = RunEntry
210223
fields = [

0 commit comments

Comments (0)