
Commit 6b76cd3

docs: added a quickstart nb (#41)
1 parent efbecf8 commit 6b76cd3

File tree

8 files changed, +372 -318 lines changed


.gitignore

Lines changed: 1 addition & 0 deletions
@@ -166,3 +166,4 @@ experiments/**/storage
 **/fil-result/
 experiments/baselines/fiqa/datasets
 src/ragas/_version.py
+.python-version

README.md

Lines changed: 11 additions & 6 deletions
@@ -55,18 +55,23 @@ This is a small example program you can run to see ragas in action!
 ```python
 
 from ragas import evaluate
+from datasets import Dataset
 import os
 
 os.environ["OPENAI_API_KEY"] = "your-openai-key"
 
-ds = Dataset({
-    features: ['question','context','answer'],
-    num_rows: 25
-})
-results = evaluate(ds)
+# prepare your huggingface dataset in the format
+# Dataset({
+#     features: ['question','contexts','answer'],
+#     num_rows: 25
+# })
+
+dataset: Dataset
+
+results = evaluate(dataset)
 
 ```
-If you want a more in-depth explanation of core components, check out our quick-start notebook
+If you want a more in-depth explanation of core components, check out our [quick-start notebook](./examples/quickstart.ipynb)
 ## :luggage: Metrics
 
 Ragas measures your pipeline's performance against two dimensions
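
For reference, a minimal runnable sketch of what the updated README snippet asks for, assuming `datasets.Dataset.from_dict` and a single made-up row. The column names come from the README; the toy values and the `print` call are illustrative and not part of this commit:

```python
import os

from datasets import Dataset
from ragas import evaluate

os.environ["OPENAI_API_KEY"] = "your-openai-key"

# one toy row; a real evaluation would use your own RAG pipeline's outputs
data = {
    "question": ["What is the capital of France?"],
    "contexts": [["Paris is the capital and largest city of France."]],
    "answer": ["Paris"],
}
dataset = Dataset.from_dict(data)

results = evaluate(dataset)  # computed metric scores for the dataset
print(results)
```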

examples/quickstart.ipynb

Lines changed: 208 additions & 185 deletions
Large diffs are not rendered by default.

experiments/assesments/metrics_assesments.ipynb

Lines changed: 1 addition & 1 deletion
@@ -64,7 +64,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"os.chdir('/Users/shahules/belar/src/')"
+"os.chdir(\"/Users/shahules/belar/src/\")"
 ]
 },
 {

experiments/baselines/fiqa/dataset-exploration-and-baseline.ipynb

Lines changed: 33 additions & 50 deletions
@@ -48,7 +48,11 @@
 "from beir.datasets.data_loader import GenericDataLoader\n",
 "\n",
 "dataset = \"fiqa\"\n",
-"url = \"https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip\".format(dataset)\n",
+"url = (\n",
+"    \"https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip\".format(\n",
+"        dataset\n",
+"    )\n",
+")\n",
 "data_path = util.download_and_unzip(url, \"datasets\")"
 ]
 },
@@ -218,7 +222,7 @@
 "source": [
 "with open(os.path.join(data_path, \"corpus.jsonl\")) as f:\n",
 "    cs = [pd.Series(json.loads(l)) for l in f.readlines()]\n",
-"    \n",
+"\n",
 "corpus_df = pd.DataFrame(cs)\n",
 "corpus_df"
 ]
@@ -299,9 +303,7 @@
 }
 ],
 "source": [
-"corpus_df = corpus_df.rename(columns={\n",
-"    \"_id\": \"corpus-id\", \"text\": \"ground_truth\"\n",
-"})\n",
+"corpus_df = corpus_df.rename(columns={\"_id\": \"corpus-id\", \"text\": \"ground_truth\"})\n",
 "corpus_df = corpus_df.drop(columns=[\"title\", \"metadata\"])\n",
 "corpus_df[\"corpus-id\"] = corpus_df[\"corpus-id\"].astype(int)\n",
 "corpus_df.head()"
@@ -387,9 +389,7 @@
 "    qs = [pd.Series(json.loads(l)) for l in f.readlines()]\n",
 "\n",
 "queries_df = pd.DataFrame(qs)\n",
-"queries_df = queries_df.rename(columns={\n",
-"    \"_id\": \"query-id\", \"text\": \"question\"\n",
-"})\n",
+"queries_df = queries_df.rename(columns={\"_id\": \"query-id\", \"text\": \"question\"})\n",
 "queries_df = queries_df.drop(columns=[\"metadata\"])\n",
 "queries_df[\"query-id\"] = queries_df[\"query-id\"].astype(int)\n",
 "queries_df.head()"
@@ -474,10 +474,10 @@
 "splits = [\"dev\", \"test\", \"train\"]\n",
 "split_df = {}\n",
 "for s in splits:\n",
-"    split_df[s] = pd.read_csv(\n",
-"        os.path.join(data_path, f\"qrels/{s}.tsv\"), sep=\"\\t\"\n",
-"    ).drop(columns=[\"score\"])\n",
-"    \n",
+"    split_df[s] = pd.read_csv(os.path.join(data_path, f\"qrels/{s}.tsv\"), sep=\"\\t\").drop(\n",
+"        columns=[\"score\"]\n",
+"    )\n",
+"\n",
 "split_df[\"dev\"].head()"
 ]
 },
@@ -515,10 +515,14 @@
 "    df = queries_df.merge(split_df[split], on=\"query-id\")\n",
 "    df = df.merge(corpus_df, on=\"corpus-id\")\n",
 "    df = df.drop(columns=[\"corpus-id\"])\n",
-"    grouped = df.groupby('query-id').apply(lambda x: pd.Series({\n",
-"        'question': x['question'].sample().values[0],\n",
-"        'ground_truths': x['ground_truth'].tolist()\n",
-"    }))\n",
+"    grouped = df.groupby(\"query-id\").apply(\n",
+"        lambda x: pd.Series(\n",
+"            {\n",
+"                \"question\": x[\"question\"].sample().values[0],\n",
+"                \"ground_truths\": x[\"ground_truth\"].tolist(),\n",
+"            }\n",
+"        )\n",
+"    )\n",
 "\n",
 "    grouped = grouped.reset_index()\n",
 "    grouped = grouped.drop(columns=\"query-id\")\n",
@@ -797,11 +801,8 @@
 "assert os.path.exists(path_to_ds_repo), f\"{path_to_ds_repo} doesnot exist!\"\n",
 "\n",
 "for s in final_split_df:\n",
-"    final_split_df[s].to_csv(\n",
-"        os.path.join(path_to_ds_repo, f\"{s}.csv\"),\n",
-"        index=False\n",
-"    )\n",
-"    \n",
+"    final_split_df[s].to_csv(os.path.join(path_to_ds_repo, f\"{s}.csv\"), index=False)\n",
+"\n",
 "corpus_df.to_csv(os.path.join(path_to_ds_repo, \"corpus.csv\"), index=False)"
 ]
 },
@@ -1009,18 +1010,11 @@
 "from llama_index.node_parser import SimpleNodeParser\n",
 "from langchain.text_splitter import TokenTextSplitter\n",
 "\n",
-"spliter = TokenTextSplitter(\n",
-"    chunk_size = 100,\n",
-"    chunk_overlap = 50\n",
-")\n",
+"spliter = TokenTextSplitter(chunk_size=100, chunk_overlap=50)\n",
 "\n",
-"parser = SimpleNodeParser(\n",
-"    text_splitter=spliter\n",
-")\n",
+"parser = SimpleNodeParser(text_splitter=spliter)\n",
 "\n",
-"nodes = parser.get_nodes_from_documents(\n",
-"    documents=docs\n",
-")"
+"nodes = parser.get_nodes_from_documents(documents=docs)"
 ]
 },
 {
@@ -1088,16 +1082,12 @@
 "source": [
 "# create index\n",
 "index = GPTVectorStoreIndex.from_documents(\n",
-"    documents=docs, \n",
+"    documents=docs,\n",
 "    service_context=openai_sc,\n",
 ")\n",
 "\n",
 "# query with embed_model specified\n",
-"qe = index.as_query_engine(\n",
-"    mode=\"embedding\", \n",
-"    verbose=True, \n",
-"    service_context=openai_sc\n",
-")"
+"qe = index.as_query_engine(mode=\"embedding\", verbose=True, service_context=openai_sc)"
 ]
 },
 {
@@ -1171,10 +1161,7 @@
 "\n",
 "# query with embed_model specified\n",
 "qe = index.as_query_engine(\n",
-"    mode=\"embedding\", \n",
-"    verbose=True, \n",
-"    service_context=openai_sc,\n",
-"    use_async = False\n",
+"    mode=\"embedding\", verbose=True, service_context=openai_sc, use_async=False\n",
 ")"
 ]
 },
@@ -1195,15 +1182,13 @@
 "\n",
 "# configure retriever\n",
 "retriever = VectorIndexRetriever(\n",
-"    index=index, \n",
+"    index=index,\n",
 "    similarity_top_k=3,\n",
 ")\n",
 "\n",
 "# configure response synthesizer\n",
 "response_synthesizer = ResponseSynthesizer.from_args(\n",
-"    node_postprocessors=[\n",
-"        SimilarityPostprocessor(similarity_cutoff=0.7)\n",
-"    ]\n",
+"    node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.7)]\n",
 ")\n",
 "\n",
 "# assemble query engine\n",
@@ -1257,9 +1242,10 @@
 "    r = qe.query(row[\"question\"])\n",
 "    row[\"answer\"] = r.response\n",
 "    row[\"contexts\"] = [sn.node.text for sn in r.source_nodes]\n",
-"    \n",
+"\n",
 "    return row\n",
 "\n",
+"\n",
 "# generate_response(test_ds[0])"
 ]
 },
@@ -1530,10 +1516,7 @@
 "from ragas.metrics import factuality, answer_relevancy, context_relevancy\n",
 "from ragas import evaluate\n",
 "\n",
-"evaluate(\n",
-"    gen_ds, \n",
-"    metrics=[factuality, answer_relevancy, context_relevancy]\n",
-")"
+"evaluate(gen_ds, metrics=[factuality, answer_relevancy, context_relevancy])"
 ]
 },
 {
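
As an aside, the groupby/apply pattern reformatted above (collapsing several ground-truth passages per query into one row with the question and a list of ground truths) can be tried in isolation. The frame below is made-up toy data; only the pattern mirrors the notebook:

```python
import pandas as pd

# toy stand-in for the merged queries/qrels/corpus frame in the notebook
df = pd.DataFrame(
    {
        "query-id": [1, 1, 2],
        "question": ["What is an ETF?", "What is an ETF?", "What is a bond?"],
        "ground_truth": [
            "An ETF is an exchange-traded fund.",
            "ETFs trade on exchanges like stocks.",
            "A bond is a fixed-income instrument.",
        ],
    }
)

# one row per query: keep one copy of the question, collect all ground truths
grouped = df.groupby("query-id").apply(
    lambda x: pd.Series(
        {
            "question": x["question"].sample().values[0],
            "ground_truths": x["ground_truth"].tolist(),
        }
    )
)
grouped = grouped.reset_index().drop(columns="query-id")
print(grouped)
```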
