Merge pull request #129 from hirzel/rag

hirzel · web-flow · commit f3a737b57928 · 2024-10-07T06:25:20.000-04:00
Use simple TF-IDF embeddings for the RAG example.
diff --git a/examples/rag/rag.pdl b/examples/rag/rag.pdl
@@ -2,21 +2,14 @@ description: Retrieval-augmented generation for NL-to-Code generation task.
 text:
 - lang: python
   code: | # initialize PDL_SESSION.vec_db and PDL_SESSION.embed() function
-    import datasets, numpy, os, requests
-    genai_key, genai_api = os.environ["WATSONX_KEY"], os.environ["WATSONX_API"]
-    def embed(text):
-        endpoint = f"{genai_api}/v1/text/embeddings?version=2024-05-02"
-        headers = {
-            "Content-Type": "application/json",
-            "Authorization": f"Bearer {genai_key}",
-        }
-        json_data = {
-            "model_id": "sentence-transformers/all-minilm-l6-v2",
-            "input": text,
-        }
-        response = requests.post(endpoint, headers=headers, json=json_data)
-        return numpy.asarray(response.json()["results"][0])
+    import datasets, sklearn.feature_extraction.text
     train_in = datasets.load_dataset("mbpp", "sanitized", split="train")
+    corpus = [row["prompt"] for row in train_in]
+    tfidf = sklearn.feature_extraction.text.TfidfVectorizer().fit(corpus)
+    def embed(text):
+        singleton_batch = [text]
+        sparse_result = tfidf.transform(raw_documents=singleton_batch)
+        return sparse_result.toarray().flatten()
     train_em = train_in.map(lambda row: {"embeddings": embed(row["prompt"])})
     PDL_SESSION.vec_db = train_em.add_faiss_index("embeddings")
     PDL_SESSION.embed = embed
@@ -50,6 +43,6 @@ text:
 
     Q: ${ TEST_PROMPT }
     A: ```
-- model: watsonx/ibm/granite-20b-code-instruct-v2
+- model: watsonx/ibm/granite-34b-code-instruct
   parameters:
     stop: ["```"]