Skip to content

Commit f3a737b

Browse files
authored
Merge pull request #129 from hirzel/rag
Use simple TFIDF for RAG example.
2 parents dec3eef + 6a2f80a commit f3a737b

File tree

1 file changed

+8
-15
lines changed

1 file changed

+8
-15
lines changed

examples/rag/rag.pdl

Lines changed: 8 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -2,21 +2,14 @@ description: Retrieval-augmented generation for NL-to-Code generation task.
2 2
text:
3 3
- lang: python
4 4
code: | # initialize PDL_SESSION.vec_db and PDL_SESSION.embed() function
5-
import datasets, numpy, os, requests
6-
genai_key, genai_api = os.environ["WATSONX_KEY"], os.environ["WATSONX_API"]
7-
def embed(text):
8-
endpoint = f"{genai_api}/v1/text/embeddings?version=2024-05-02"
9-
headers = {
10-
"Content-Type": "application/json",
11-
"Authorization": f"Bearer {genai_key}",
12-
}
13-
json_data = {
14-
"model_id": "sentence-transformers/all-minilm-l6-v2",
15-
"input": text,
16-
}
17-
response = requests.post(endpoint, headers=headers, json=json_data)
18-
return numpy.asarray(response.json()["results"][0])
5+
import datasets, sklearn.feature_extraction.text
19 6
train_in = datasets.load_dataset("mbpp", "sanitized", split="train")
7+
corpus = [row["prompt"] for row in train_in]
8+
tfidf = sklearn.feature_extraction.text.TfidfVectorizer().fit(corpus)
9+
def embed(text):
10+
singleton_batch = [text]
11+
sparse_result = tfidf.transform(raw_documents=singleton_batch)
12+
return sparse_result.toarray().flatten()
20 13
train_em = train_in.map(lambda row: {"embeddings": embed(row["prompt"])})
21 14
PDL_SESSION.vec_db = train_em.add_faiss_index("embeddings")
22 15
PDL_SESSION.embed = embed
@@ -50,6 +43,6 @@ text:
50 43

51 44
Q: ${ TEST_PROMPT }
52 45
A: ```
53-
- model: watsonx/ibm/granite-20b-code-instruct-v2
46+
- model: watsonx/ibm/granite-34b-code-instruct
54 47
parameters:
55 48
stop: ["```"]

0 commit comments

Comments
 (0)