Skip to content

Commit e7d66ed

Browse files
committed
Further small changes
1 parent ebe3621 commit e7d66ed

File tree

6 files changed

+28
-8
lines changed

6 files changed

+28
-8
lines changed

llm-complete-guide/README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,8 @@ environment and install the dependencies using the following command:
4343
pip install -r requirements.txt
4444
```
4545

46+
If the install fails on flash-attn, skip its CUDA build: FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE pip install flash-attn --no-build-isolation
47+
4648
In order to use the default LLM for this query, you'll need an account and an
4749
API key from OpenAI specified as another environment variable:
4850

llm-complete-guide/requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ tiktoken
1717
umap-learn
1818
matplotlib
1919
pyarrow
20-
rerankers[all]
20+
rerankers[flashrank]
2121
datasets
2222
torch
2323

llm-complete-guide/run.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@
4242
from materializers.document_materializer import DocumentMaterializer
4343
from pipelines import (
4444
finetune_embeddings,
45+
generate_synthetic_data,
4546
llm_basic_rag,
4647
llm_eval,
4748
)
@@ -102,6 +103,13 @@
102103
default=False,
103104
help="Disable cache.",
104105
)
106+
@click.option(
107+
"--synthetic",
108+
"synthetic",
109+
is_flag=True,
110+
default=False,
111+
help="Run the synthetic data pipeline.",
112+
)
105113
@click.option(
106114
"--local",
107115
"local",
@@ -143,6 +151,7 @@ def main(
143151
query: Optional[str] = None,
144152
model: str = OPENAI_MODEL,
145153
no_cache: bool = False,
154+
synthetic: bool = False,
146155
local: bool = False,
147156
embeddings: bool = False,
148157
dummyembeddings: bool = False,
@@ -157,6 +166,7 @@ def main(
157166
query (Optional[str]): If provided, the RAG model will be queried with this string.
158167
model (str): The model to use for the completion. Default is OPENAI_MODEL.
159168
no_cache (bool): If `True`, cache will be disabled.
169+
synthetic (bool): If `True`, the synthetic data pipeline will be run.
160170
local (bool): If `True`, the local LLM via Ollama will be used.
161171
dummyembeddings (bool): If `True`, dummyembeddings will be used
162172
embeddings (bool): If `True`, the embeddings will be fine-tuned.
@@ -188,6 +198,8 @@ def main(
188198
llm_basic_rag.with_options(**pipeline_args)()
189199
if evaluation:
190200
llm_eval.with_options(**pipeline_args)()
201+
if synthetic:
202+
generate_synthetic_data.with_options(**pipeline_args)()
191203
if embeddings:
192204
finetune_embeddings.with_options(**embeddings_finetune_args)()
193205
if dummyembeddings:

llm-complete-guide/steps/distilabel_generate_queries.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@
1515
import os
1616
from typing import Annotated, Tuple
1717

18-
import distilabel
1918
from constants import (
2019
DATASET_NAME_DEFAULT,
2120
OPENAI_MODEL_GEN,
@@ -25,6 +24,7 @@
2524
from distilabel.llms import OpenAILLM
2625
from distilabel.steps import LoadDataFromHub
2726
from distilabel.steps.tasks import GenerateSentencePair
27+
from distilabel.pipeline import Pipeline
2828
from zenml import step
2929

3030
synthetic_generation_context = """
@@ -45,7 +45,7 @@ def generate_synthetic_queries(
4545
model=OPENAI_MODEL_GEN, api_key=os.getenv("OPENAI_API_KEY")
4646
)
4747

48-
with distilabel.pipeline.Pipeline(
48+
with Pipeline(
4949
name="generate_embedding_queries"
5050
) as pipeline:
5151
load_dataset = LoadDataFromHub(
@@ -74,7 +74,7 @@ def generate_synthetic_queries(
7474
}
7575
},
7676
},
77-
# use_cache=False, # comment out for demo
77+
use_cache=False,  # caching disabled for the demo
7878
)
7979

8080
test_distiset = pipeline.run(
@@ -89,7 +89,7 @@ def generate_synthetic_queries(
8989
}
9090
},
9191
},
92-
# use_cache=False, # comment out for demo
92+
use_cache=False,  # caching disabled for the demo
9393
)
9494

9595
train_dataset = train_distiset["default"]["train"]

llm-complete-guide/steps/eval_retrieval.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -198,7 +198,7 @@ def perform_retrieval_evaluation(
198198

199199
if all(url_ending not in url for url in urls):
200200
logging.error(
201-
f"Failed for question: {question}. Expected URL ending: {url_ending}. Got: {urls}"
201+
f"Failed for question: {question}. Expected URL containing: {url_ending}. Got: {urls}"
202202
)
203203
failures += 1
204204

llm-complete-guide/steps/url_scraping_utils.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,10 @@ def is_valid_url(url: str, base: str) -> bool:
4848
return not re.search(version_pattern, url)
4949

5050

51+
def strip_query_params(url):
52+
return url.split('?')[0]
53+
54+
5155
def get_all_pages(url: str) -> List[str]:
5256
"""
5357
Retrieve all pages with the same base as the given URL.
@@ -61,9 +65,11 @@ def get_all_pages(url: str) -> List[str]:
6165
logger.info(f"Scraping all pages from {url}...")
6266
base_url = urlparse(url).netloc
6367
pages = crawl(url, base_url)
64-
logger.info(f"Found {len(pages)} pages.")
68+
stripped_pages = [strip_query_params(page) for page in pages]
69+
70+
logger.info(f"Found {len(stripped_pages)} pages.")
6571
logger.info("Done scraping pages.")
66-
return list(pages)
72+
return list(stripped_pages)
6773

6874

6975
def crawl(url: str, base: str, visited: Set[str] = None) -> Set[str]:

0 commit comments

Comments
 (0)