|
| 1 | +# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/01_KnowledgeEmbedding.ipynb. |
| 2 | + |
| 3 | +# %% auto 0 |
| 4 | +__all__ = ['remove_newlines', 'text2data', 'tokenize_and_split', 'tokenize_data', 'embed_data', 'mean_pooling', 'run_embeddings', |
| 5 | + 'cosine_similarity', 'top_scores', 'create_context', 'answer_question'] |
| 6 | + |
| 7 | +# %% ../nbs/01_KnowledgeEmbedding.ipynb 3 |
| 8 | +import os |
| 9 | +import glob |
| 10 | +import pandas as pd |
| 11 | +from tqdm.auto import tqdm |
| 12 | +from typing import List, Dict, Set, Union, Callable |
| 13 | +import torch |
| 14 | +import numpy as np |
| 15 | +from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM |
| 16 | +import torch.nn.functional as F |
| 17 | +from functools import partial |
| 18 | +import transformers |
| 19 | + |
| 20 | + |
| 21 | +# %% ../nbs/01_KnowledgeEmbedding.ipynb 4 |
def remove_newlines(text: str) -> str:
    """Collapse every run of whitespace (newlines, tabs, spaces) into a single space."""
    words = text.split()
    return " ".join(words)
| 24 | + |
| 25 | + |
| 26 | +# %% ../nbs/01_KnowledgeEmbedding.ipynb 5 |
def text2data(path: str, extensions: Union[str, Set[str]] = {"txt"}, recursive: bool = False) -> List[Dict[str, str]]:
    """
    Go over all the text files in a folder and create a list of dicts with the file name and the cleaned text.

    Parameters:
        path: root folder to scan.
        extensions: a single extension (e.g. "txt") or a set of extensions,
            without the leading dot; a falsy value keeps every glob match.
        recursive: forwarded to glob.glob; controls whether "**" matches
            nested directories.

    Returns:
        One dict per readable file: {'source': cleaned name,
        'data': cleaned name + '.' + whitespace-collapsed file contents}.
    """
    texts = []

    # Allow a bare string like "txt" as shorthand for {"txt"}.
    if isinstance(extensions, str) and extensions:
        extensions = {extensions}

    # NOTE(review): with recursive=False (the default) the "**" segment behaves
    # like a plain "*", so files directly inside `path` are never matched --
    # only files at least one directory level down. Confirm this is intended.
    path = os.path.join(path, "**", "*")
    # Keep files whose suffix after the last '.' is in `extensions`; when
    # `extensions` is falsy, every glob match (including directories, since the
    # isfile check is inside the other branch) is kept.
    files = [f for f in glob.glob(path, recursive=recursive) if (not extensions) or (f.split('.')[-1] in extensions and os.path.isfile(f))]

    for file in tqdm(files):
        try:
            with open(file, "r", encoding="UTF-8") as f:
                # NOTE(review): `path` was reassigned above, so dirname(path) is
                # "<root>/**" and relpath therefore yields names starting with
                # "../" -- verify the resulting 'source' values look right.
                name_value = os.path.relpath(file, start=os.path.dirname(path))
                # Turn separators into spaces so the name reads like text, and
                # strip the "#update" marker used in some file names.
                name_value = name_value.replace("-", " ").replace("_", " ").replace("#update", "")
                # Prepend the cleaned name so it is embedded along with the body.
                texts.append({'source': name_value, 'data': name_value + '.' + remove_newlines(f.read())})
        except Exception as e:
            # Best-effort scan: report unreadable files and keep going.
            print('error:', e)

    return texts
| 49 | + |
| 50 | + |
| 51 | + |
| 52 | +# %% ../nbs/01_KnowledgeEmbedding.ipynb 7 |
def tokenize_and_split(text: str, tokenizer, max_tokens: int = 500, sentence_sep: str = '. ') -> List[list]:
    '''
    Encode `text` sentence by sentence and pack the tokens into chunks of at
    most `max_tokens` tokens each.

    Parameters:
        text: the text to encode.
        tokenizer: any object exposing `encode(str) -> list[int]`.
        max_tokens: maximum number of tokens per returned chunk.
        sentence_sep: separator used to split `text` into sentences.

    Returns:
        A list of token lists, each of length <= max_tokens, covering every
        token of every sentence in input order.
    '''
    sentences = text.split(sentence_sep)
    tokens_list = [tokenizer.encode(" " + sentence + '.') for sentence in sentences]

    chunks = []
    chunk = []

    for tokens in tokens_list:
        if len(tokens) > max_tokens:
            # A single sentence exceeds max_tokens: flush the chunk built so
            # far first (so output order follows the text), then hard-split the
            # long sentence into max_tokens-sized pieces.
            if chunk:
                chunks.append(chunk)
                chunk = []
            chunks.extend(tokens[i:i + max_tokens] for i in range(0, len(tokens), max_tokens))
        elif len(chunk) + len(tokens) > max_tokens:
            # Current chunk is full: emit it and start the next chunk with this
            # sentence. (Bug fix: the previous version reset `chunk` to [] here,
            # silently dropping this sentence's tokens.)
            chunks.append(chunk)
            chunk = list(tokens)
        else:
            chunk.extend(tokens)

    # Don't lose the trailing partial chunk.
    if chunk:
        chunks.append(chunk)

    return chunks
| 76 | + |
| 77 | + |
| 78 | + |
| 79 | +# %% ../nbs/01_KnowledgeEmbedding.ipynb 8 |
def tokenize_data(data: List[Dict[str, str]], tokenizer, max_tokens: int = 500, sentence_sep: str = '. ') -> List[Dict[str, List[int]]]:
    '''
    Split every entry's text into token chunks of size < max_tokens.

    Takes a list of dicts of the form {'source': file_name, 'data': clean_text}
    and returns one dict {'source': file_name, 'tokens': list of int} per chunk.
    '''
    result = []
    for entry in data:
        source = entry['source']
        for chunk in tokenize_and_split(entry['data'], tokenizer, max_tokens, sentence_sep):
            result.append({'source': source, 'tokens': chunk})
    return result
| 89 | + |
| 90 | + |
| 91 | +# %% ../nbs/01_KnowledgeEmbedding.ipynb 9 |
def embed_data(data: List[Dict[str, List[int]]], embedding_model: Callable[[List[int]], np.array]) -> List[dict]:
    '''
    Attach an embedding to every tokenized chunk.

    Takes a list of dicts {'source': file_name, 'tokens': list of int} and
    returns new dicts with an added 'embeddings' key, computed by calling
    `embedding_model` on the token list. Progress is shown with tqdm.
    '''
    embedded = []
    for entry in tqdm(data):
        embedded.append({
            'source': entry['source'],
            'tokens': entry['tokens'],
            'embeddings': embedding_model(entry['tokens']),
        })
    return embedded
| 99 | + |
| 100 | + |
| 101 | + |
| 102 | +# %% ../nbs/01_KnowledgeEmbedding.ipynb 10 |
def mean_pooling(model_output, attention_mask):
    '''
    Average the token embeddings of each sequence, counting only positions
    where the attention mask is set (masked positions contribute nothing).
    '''
    # model_output[0] holds the per-token embeddings: (batch, seq, dim).
    token_embeddings = model_output[0]
    # Broadcast the (batch, seq) mask across the embedding dimension.
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    summed = (token_embeddings * mask).sum(dim=1)
    # Clamp avoids division by zero for fully-masked sequences.
    counts = mask.sum(dim=1).clamp(min=1e-9)
    return summed / counts
| 110 | + |
| 111 | + |
| 112 | +# %% ../nbs/01_KnowledgeEmbedding.ipynb 11 |
def run_embeddings(tokenized : List[int], model : Callable[[torch.Tensor, torch.Tensor], torch.Tensor]) -> np.array:
    '''
    Embed a single tokenized sentence: run the model, mean-pool the token
    embeddings, L2-normalize, and return a 1D numpy vector on the CPU.
    '''
    device = model.device
    # Shape (1, seq): a batch of one sentence, with a full attention mask.
    input_ids = torch.tensor(tokenized).unsqueeze(0).to(device)
    mask = torch.ones_like(input_ids).to(device)

    # Inference only -- no gradients needed.
    with torch.no_grad():
        output = model(input_ids=input_ids, attention_mask=mask)

    pooled = mean_pooling(output, mask)
    normalized = F.normalize(pooled, p=2, dim=1)

    return normalized.squeeze().to('cpu').numpy()
| 128 | + |
| 129 | + |
| 130 | +# %% ../nbs/01_KnowledgeEmbedding.ipynb 13 |
def cosine_similarity(a: np.array, b: np.array) -> float:
    '''
    Compute the cosine similarity between two equal-shape numpy vectors.

    Raises:
        ValueError: if either input is empty, the shapes differ, or either
            vector has zero norm.
    '''
    a = np.asarray(a)
    b = np.asarray(b)

    # Validate inputs before touching the math.
    if min(a.size, b.size) == 0:
        raise ValueError("Input arrays should not be empty.")
    if a.shape != b.shape:
        raise ValueError("Input arrays should have the same shape.")

    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    # A zero vector has no direction, so similarity is undefined.
    if norm_a == 0 or norm_b == 0:
        raise ValueError("Input arrays should not be zero vectors.")

    return np.dot(a, b) / (norm_a * norm_b)
| 150 | + |
| 151 | +# %% ../nbs/01_KnowledgeEmbedding.ipynb 14 |
def top_scores(question: str, embedded: List[dict], model : Callable, tokenizer : Callable, n: int = 5, same_th: float = 0.2):
    '''
    Return the indices (into `embedded`) of up to n chunks, ordered by cosine
    similarity of their embeddings to the embedded question.

    Parameters:
        question: the question text.
        embedded: list of dicts each holding an 'embeddings' vector.
        model: embedding model passed to run_embeddings.
        tokenizer: tokenizer; called as tokenizer(question)['input_ids'].
        n: maximum number of indices to return.
        same_th: similarity threshold used when filtering candidates.

    Raises:
        ValueError: if n <= 0, same_th is outside [0, 1), or embedded is empty.
    '''
    # Ensure n is greater than 0
    if n <= 0:
        raise ValueError("The number of top answers should be greater than 0.")

    # Ensure same_th is within the appropriate range
    if not 0 <= same_th < 1:
        raise ValueError("The same_th parameter should be in the range [0, 1).")

    # Guard: an empty corpus would crash below when taking nearest[0].
    if not embedded:
        raise ValueError("The embedded data should not be empty.")

    tokenized_question = tokenizer(question)['input_ids']
    embedded_question = run_embeddings(tokenized_question, model)

    # Similarity of the question to every chunk, best first.
    scores = np.array([cosine_similarity(embedded_question, embed['embeddings']) for embed in embedded])
    nearest = scores.argsort()[::-1]

    # The best match is always kept.
    answers = [nearest[0]]

    for i in nearest[1:]:
        # Bug fix: stop as soon as we already hold n answers. The previous
        # version checked `len(answers) == n` only *after* appending, so for
        # n == 1 the condition could never fire and every candidate passing
        # the threshold was returned.
        if len(answers) >= n:
            break

        # Lowest similarity between this candidate and the answers chosen so far.
        min_sim = min(cosine_similarity(embedded[i]['embeddings'], embedded[a]['embeddings']) for a in answers)

        # NOTE(review): this keeps candidates that are at least same_th similar
        # to EVERY already-chosen answer. If the intent was to drop
        # near-duplicates, the comparison direction looks inverted
        # (max similarity < threshold) -- confirm with the notebook.
        if min_sim > same_th:
            answers.append(i)

    # Return the indices of the top answers
    return answers
| 188 | + |
| 189 | + |
| 190 | +# %% ../nbs/01_KnowledgeEmbedding.ipynb 15 |
def create_context(question: str, embedded: List[dict], model : Callable, tokenizer: Callable, max_len: int = 1700, **kwargs) -> str:
    """
    Build a context string for a question from the most similar embedded
    chunks, decoding and concatenating chunks until max_len tokens is reached.
    Extra keyword arguments are forwarded to top_scores.
    """
    # Rank the chunks by similarity to the question.
    best = top_scores(question, embedded, model=model, tokenizer=tokenizer, **kwargs)

    pieces = []
    used = 0

    for idx in best:
        tokens = embedded[idx]['tokens']
        # +4 accounts for the "###" separator and surrounding newlines.
        cost = len(tokens) + 4

        # Stop once the budget would be exceeded.
        if used + cost > max_len:
            break

        pieces.append(tokenizer.decode(tokens))
        used += cost

    # Chunks are joined with a visible separator so the model can tell them apart.
    return "\n\n###\n\n".join(pieces)
| 219 | + |
| 220 | + |
| 221 | +# %% ../nbs/01_KnowledgeEmbedding.ipynb 17 |
def answer_question(question: str,
                    embedded: List[dict],
                    context_model: Callable,
                    context_tokenizer: Callable,
                    model: Callable,
                    tokenizer: Callable,
                    max_len: int = 1700,
                    max_added_tokens: int = 150,
                    temperature: float = 0.3,
                    debug: bool = False) -> str:
    """
    Generate an answer to a question based on the most similar context found in the embedded data.

    Parameters:
        question (str): The question to answer.
        embedded (List[dict]): List of embedded data.
        context_model (Callable): Model used to create context.
        context_tokenizer (Callable): Tokenizer used with context model.
        model (Callable): Model used to generate answers.
        tokenizer (Callable): Tokenizer used with answer model.
        max_len (int): Maximum length for the context. Defaults to 1700.
        max_added_tokens (int): Maximum number of new tokens for the generated answer. Defaults to 150.
        temperature (float): Temperature for the answer generation. Defaults to 0.3.
        debug (bool): If True, print the generated context.

    Returns:
        str: Generated answer.
    """
    # Retrieve the most relevant chunks for this question.
    context = create_context(question, embedded, model=context_model, tokenizer=context_tokenizer, max_len=max_len)

    if debug:
        print("Generated Context:\n" + context + "\n\n")

    # Build the chat-style prompt consumed by the answer model.
    prompt = ("<human>: Answer the question based on the context below, and if the question can't be answered "
              f"based on the context, say \"I don't know\"\n\nContext: {context}\n\n---\n\nQuestion: {question}<bot>:")

    # Tokenize on the model's device.
    encoded = tokenizer(prompt, return_tensors='pt').to(model.device)
    prompt_len = encoded['input_ids'].shape[1]

    # Sample a continuation; the budget is prompt length plus max_added_tokens.
    generated = model.generate(**encoded,
                               max_length=prompt_len + max_added_tokens,
                               do_sample=True,
                               temperature=temperature,
                               top_p=0.7,
                               top_k=50)

    # Keep only the newly generated tokens (everything after the prompt).
    answer_tokens = generated[0, prompt_len:]
    return tokenizer.decode(answer_tokens, skip_special_tokens=True)
| 279 | + |
0 commit comments