|
| 1 | +import json |
| 2 | +import logging |
| 3 | +import os |
| 4 | +import pathlib |
| 5 | +import re |
| 6 | + |
| 7 | +import rich |
| 8 | +from azure.identity import AzureDeveloperCliCredential, get_bearer_token_provider |
| 9 | +from azure.search.documents import SearchClient |
| 10 | +from dotenv_azd import load_azd_env |
| 11 | +from langchain_community.document_loaders import JSONLoader |
| 12 | +from langchain_core.documents import Document as LCDocument |
| 13 | +from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings |
| 14 | +from ragas.embeddings import LangchainEmbeddingsWrapper |
| 15 | +from ragas.llms import LangchainLLMWrapper |
| 16 | +from ragas.testset import TestsetGenerator |
| 17 | +from ragas.testset.graph import KnowledgeGraph, Node, NodeType |
| 18 | +from ragas.testset.transforms import apply_transforms, default_transforms |
| 19 | + |
# Module-level logger for the evaluation tooling.
logger = logging.getLogger("evals")

# Pull environment variables from the active azd (Azure Developer CLI) environment.
load_azd_env()
# Repository root: two directories up from this file.
root_dir = pathlib.Path(__file__).parent.parent
| 24 | + |
| 25 | + |
def get_azure_credential():
    """Return an AzureDeveloperCliCredential for authenticating to Azure.

    When AZURE_TENANT_ID is set, the credential is scoped to that tenant;
    otherwise the azd CLI's home tenant is used.
    """
    tenant_id = os.getenv("AZURE_TENANT_ID")
    if tenant_id:
        logger.info("Setting up Azure credential using AzureDeveloperCliCredential with tenant_id %s", tenant_id)
        return AzureDeveloperCliCredential(tenant_id=tenant_id, process_timeout=60)
    logger.info("Setting up Azure credential using AzureDeveloperCliCredential for home tenant")
    return AzureDeveloperCliCredential(process_timeout=60)
| 35 | + |
| 36 | + |
def get_search_documents(azure_credential) -> list[dict]:
    """Fetch up to 10 documents from the configured Azure AI Search index.

    :param azure_credential: Credential used to authenticate against the
        search service (e.g. from get_azure_credential()).
    :return: The raw result documents (dict-like) returned by the index.
    """
    search_client = SearchClient(
        endpoint=f"https://{os.getenv('AZURE_SEARCH_SERVICE')}.search.windows.net",
        index_name=os.getenv("AZURE_SEARCH_INDEX"),
        credential=azure_credential,
    )
    # search_text="*" matches every document; top=10 caps how many we pull.
    search_results = search_client.search(search_text="*", top=10)
    # BUG FIX: the annotation previously claimed `-> str` although a list of
    # results is returned; materialize the paged iterator with list().
    return list(search_results)
| 45 | + |
| 46 | + |
# Load the Description field of an example JSON document via a jq expression.
# NOTE(review): `docs` does not appear to be referenced anywhere later in this
# file — possibly leftover from an earlier approach; confirm before removing.
path = root_dir / "data/Json_Examples/2189.json"
loader = JSONLoader(path, jq_schema=".Description")
docs = loader.load()
| 50 | + |
| 51 | + |
# Wire up the generator LLM and embeddings against Azure OpenAI, authenticating
# with an Entra ID bearer token instead of an API key.
azure_credential = get_azure_credential()
azure_openai_api_version = os.getenv("AZURE_OPENAI_API_VERSION") or "2024-06-01"
azure_endpoint = f"https://{os.getenv('AZURE_OPENAI_SERVICE')}.openai.azure.com"
azure_ad_token_provider = get_bearer_token_provider(azure_credential, "https://cognitiveservices.azure.com/.default")

chat_model = AzureChatOpenAI(
    openai_api_version=azure_openai_api_version,
    azure_endpoint=azure_endpoint,
    azure_ad_token_provider=azure_ad_token_provider,
    azure_deployment=os.getenv("AZURE_OPENAI_EVAL_DEPLOYMENT"),
    model=os.environ["AZURE_OPENAI_EVAL_MODEL"],
    validate_base_url=False,
)
generator_llm = LangchainLLMWrapper(chat_model)

# Embeddings back answer_relevancy, answer_correctness and answer_similarity.
embedding_client = AzureOpenAIEmbeddings(
    openai_api_version=azure_openai_api_version,
    azure_endpoint=azure_endpoint,
    azure_ad_token_provider=azure_ad_token_provider,
    azure_deployment=os.getenv("AZURE_OPENAI_EMB_DEPLOYMENT"),
    model=os.environ["AZURE_OPENAI_EMB_MODEL_NAME"],
)
generator_embeddings = LangchainEmbeddingsWrapper(embedding_client)
| 77 | + |
# Build a knowledge graph from documents pulled out of Azure AI Search.
search_docs = get_search_documents(azure_credential)

# Default Ragas transforms, driven by the same generator LLM and embeddings.
corpus = [LCDocument(page_content=doc["content"]) for doc in search_docs]
transforms = default_transforms(
    documents=corpus,
    llm=generator_llm,
    embedding_model=generator_embeddings,
)
| 87 | + |
# Turn each search document into a Ragas DOCUMENT node, prefixing the content
# with its citation as [[sourcepage]] so the citation survives into the
# generated reference contexts.
nodes = [
    Node(
        type=NodeType.DOCUMENT,
        properties={
            "page_content": f"[[{doc['sourcepage']}]]: {doc['content']}",
            "document_metadata": {"citation": doc["sourcepage"]},
        },
    )
    for doc in search_docs
]

kg = KnowledgeGraph(nodes=nodes)
apply_transforms(kg, transforms)

generator = TestsetGenerator(llm=generator_llm, embedding_model=generator_embeddings, knowledge_graph=kg)
dataset = generator.generate(testset_size=10, with_debugging_logs=True)
| 107 | + |
# Convert generated samples into question/truth pairs, re-attaching the
# [[citation]] markers embedded in each reference context as [citation] tags.
citation_pattern = re.compile(r"\[\[(.*?)\]\]")  # hoisted out of the loop
qa_pairs = []
for sample in dataset.samples:
    rich.print(sample)
    question = sample.eval_sample.user_input
    truth = sample.eval_sample.reference
    # Grab the citation in square brackets from the reference_contexts and add
    # it to the truth. Only the first [[...]] marker per context is used.
    citations = []
    for context in sample.eval_sample.reference_contexts:
        match = citation_pattern.search(context)
        if match:
            citations.append(f"[{match.group(1)}]")
    # BUG FIX: previously a lone trailing space was appended to `truth` when no
    # citation was found in any context.
    if citations:
        truth += " " + " ".join(citations)
    qa_pairs.append({"question": question, "truth": truth})

# Append (not overwrite) so repeated runs accumulate ground-truth examples.
with open(root_dir / "ground_truth_ragas.jsonl", "a", encoding="utf-8") as f:
    for qa_pair in qa_pairs:
        f.write(json.dumps(qa_pair) + "\n")