@@ -2,21 +2,14 @@ description: Retrieval-augmented generation for NL-to-Code generation task.
2 2 text:
3 3 - lang: python
4 4 code: | # initialize PDL_SESSION.vec_db and PDL_SESSION.embed() function
5- import datasets, numpy, os, requests
6- genai_key, genai_api = os.environ["WATSONX_KEY"], os.environ["WATSONX_API"]
7- def embed(text):
8- endpoint = f"{genai_api}/v1/text/embeddings?version=2024-05-02"
9- headers = {
10- "Content-Type": "application/json",
11- "Authorization": f"Bearer {genai_key}",
12- }
13- json_data = {
14- "model_id": "sentence-transformers/all-minilm-l6-v2",
15- "input": text,
16- }
17- response = requests.post(endpoint, headers=headers, json=json_data)
18- return numpy.asarray(response.json()["results"][0])
5+ import datasets, sklearn.feature_extraction.text
19 6 train_in = datasets.load_dataset("mbpp", "sanitized", split="train")
7+ corpus = [row["prompt"] for row in train_in]
8+ tfidf = sklearn.feature_extraction.text.TfidfVectorizer().fit(corpus)
9+ def embed(text):
10+ singleton_batch = [text]
11+ sparse_result = tfidf.transform(raw_documents=singleton_batch)
12+ return sparse_result.toarray().flatten()
20 13 train_em = train_in.map(lambda row: {"embeddings": embed(row["prompt"])})
21 14 PDL_SESSION.vec_db = train_em.add_faiss_index("embeddings")
22 15 PDL_SESSION.embed = embed
50 43
51 44 Q: ${ TEST_PROMPT }
52 45 A: ```
53- - model: watsonx/ibm/granite-20b-code-instruct-v2
46+ - model: watsonx/ibm/granite-34b-code-instruct
54 47 parameters:
55 48 stop: ["```"]
0 commit comments