Commit 655d19d

added rng to all chat_completion functions and relevant tests

1 parent 46ca895 commit 655d19d

8 files changed: +796 -501 lines changed

README.md

Lines changed: 12 additions & 1 deletion
@@ -95,7 +95,7 @@ tabmemcheck.run_all_tests("adult-test.csv", "gpt-4-0613")
 
 # How do the tests work?
 
-We use few-shot learning to condition chat models on the task of regurgitating their training data. This works well for GPT-3.5 and GPT-4, and also for many other LLMs (but not necessarily for all LLMs).
+We use few-shot learning to condition chat models on the desired task. This works well for GPT-3.5 and GPT-4, and also for many other LLMs (but not necessarily for all LLMs).
 
 You can set ```tabmemcheck.config.print_prompts = True``` to see the prompts.
 
@@ -114,6 +114,17 @@ Because one needs to weigh the completions of the LLM against the entropy in th
 
 While this all sounds very complex, the practical evidence for memorization is often very clear. This can also be seen in the examples above.
 
+
+# Can I use this package to write my own tests?
+
+This package provides two fairly general functions:
+
+- ```tabmemcheck.chat_completion```
+- ```tabmemcheck.prefix_suffix_chat_completion```
+
+
+
 # Using the package with your own LLM
 
 To test your own LLM, simply implement ```tabmemcheck.LLM_Interface```. We use the OpenAI message format.
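For illustration, a custom test built on ```tabmemcheck.prefix_suffix_chat_completion``` might look roughly like the sketch below. This is not part of the commit: the system prompt and synthetic rows are made up, and `llm` is a placeholder for any ```tabmemcheck.LLM_Interface``` implementation.

```python
import numpy as np
import tabmemcheck

llm = ...  # placeholder: any tabmemcheck.LLM_Interface implementation

# split some synthetic csv rows into prefixes (first two fields) and suffixes (last field)
rows = [f"{i}, feature_{i % 7}, {i * i}" for i in range(50)]
prefixes = [", ".join(row.split(", ")[:2]) + ", " for row in rows]
suffixes = [row.split(", ")[2] for row in rows]

test_prefixes, test_suffixes, responses = tabmemcheck.prefix_suffix_chat_completion(
    llm,
    prefixes,
    suffixes,
    system_prompt="You complete partial csv rows.",  # made-up prompt
    few_shot=5,  # draw 5 few-shot examples from the same lists
    num_queries=10,
    rng=np.random.default_rng(42),  # the new rng argument: reproducible sampling
)
```

A custom test would then compare `responses` against `test_suffixes`, for example with a Levenshtein distance, as the built-in tests do.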

examples/MLE-bench-contamination.ipynb

Lines changed: 652 additions & 61 deletions
Large diffs are not rendered by default.

tabmemcheck/__init__.py

Lines changed: 3 additions & 0 deletions
@@ -67,6 +67,9 @@ def __delattr__(self, key):
 # csv file loading options
 config.csv_max_rows = 100000  # maximum number of rows to load from a csv file
 
+# how to display test output. "cmd" or "html" for jupyter notebook html display
+config.display = "cmd"
+
 # default: no prompt/response logging
 config.current_logging_task = None
 config.current_logging_folder = None
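With the new ```config.display``` option, a notebook user can switch the display mode before running tests. A one-line sketch, assuming the package-level ```tabmemcheck.config``` object shown in the README:

```python
import tabmemcheck

# render test results as html in a jupyter notebook instead of terminal colors
tabmemcheck.config.display = "html"
```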

tabmemcheck/chat_completion.py

Lines changed: 31 additions & 11 deletions
@@ -35,6 +35,7 @@ def feature_values_chat_completion(
     fs_cond_feature_names=[],  # a list of lists of conditional feature names for each few-shot example
     add_description=True,
     out_file=None,
+    rng=None,
 ):
     """Feature chat completion task. This task asks the LLM to complete the feature values of observations in the dataset.
 
@@ -124,6 +125,7 @@ def feature_values_chat_completion(
         few_shot=few_shot_prefixes_suffixes,
         num_queries=num_queries,
         out_file=out_file,
+        rng=rng,
     )
 
     return test_prefixes, test_suffixes, responses
@@ -145,6 +147,7 @@ def row_chat_completion(
     few_shot=7,
     out_file=None,
     print_levenshtein=False,
+    rng=None,
 ):
     """Row chat completion task. This task asks the LLM to predict the next row in the
     csv file, given the previous rows. This task is the basis for the row completion
@@ -171,6 +174,7 @@ def row_chat_completion(
         num_queries=num_queries,
         out_file=out_file,
         print_levenshtein=print_levenshtein,
+        rng=rng,
     )
 
     return test_prefixes, test_suffixes, responses
@@ -183,6 +187,7 @@ def row_completion(
     num_queries=100,
     out_file=None,  # TODO support out_file
     print_levenshtein=False,
+    rng=None,
 ):
     """Plain language model variant of row_chat_completion"""
     # load the file as a list of strings
@@ -192,7 +197,11 @@ def row_completion(
     prefixes = []
     suffixes = []
     responses = []
-    for idx in np.random.choice(
+
+    if rng is None:
+        rng = np.random.default_rng()
+
+    for idx in rng.choice(
         len(rows) - num_prefix_rows, num_queries, replace=False
     ):
         # prepare query
@@ -408,9 +417,7 @@ def chat_completion(
 
 
 ####################################################################################
-# Almost all of the different tests that we perform
-# can be cast in the prompt structure of
-# 'prefix-suffix chat completion'.
+# Many tests can be cast in the prompt structure of 'prefix-suffix chat completion'.
 # This is implemented by the following function.
 ####################################################################################
 
@@ -426,8 +433,7 @@ def prefix_suffix_chat_completion(
     out_file=None,
     rng=None,
 ):
-    """A basic chat completion function. Takes a list of prefixes and suffixes and a system prompt.
-    Sends {num_queries} prompts of the format
+    """A general-purpose chat completion function. Given prefixes, suffixes, and few-shot examples, this function sends {num_queries} LLM queries of the format
 
         System: <system_prompt>
         User: <prefix> |
@@ -438,13 +444,27 @@ def prefix_suffix_chat_completion(
         User: <prefix>
         Assistant: <response> (= test suffix?)
 
-    The num_queries prefixes and suffixes are randomly selected from the respective lists.
-    The function guarantees that the test suffix (as a complete string) is not contained in any of the few-shot prefixes or suffixes.
+    The prefixes, suffixes, and few-shot examples are randomly selected.
+
+    This function guarantees that the test suffix (as a complete string) is not contained in any of the few-shot prefixes or suffixes (a useful sanity check: we don't want to provide the desired response anywhere in the context).
 
-    Stores the results in a csv file.
+    Args:
+        llm (LLM_Interface): The LLM.
+        prefixes (list[str]): A list of prefixes.
+        suffixes (list[str]): A list of suffixes.
+        system_prompt (str): The system prompt.
+        few_shot (int or list, optional): Either an integer, to select the given number of few-shot examples from the lists of prefixes and suffixes, or a list [([prefixes], [suffixes]), ..., ([prefixes], [suffixes])] to select one few-shot example from each pair of lists. Defaults to None.
+        num_queries (int, optional): The number of queries. Defaults to 100.
+        print_levenshtein (bool, optional): Visualize the Levenshtein string distance between test suffixes and LLM responses. Defaults to False.
+        out_file (str, optional): Save all queries to a CSV file. Defaults to None.
+        rng (np.random.Generator, optional): Random number generator for reproducible sampling. Defaults to None.
 
-    Returns: the test prefixes, test suffixes, and responses
-    """
+    Raises:
+        Exception: If an error occurs.
+
+    Returns:
+        tuple: The test prefixes, test suffixes, and responses.
+    """
     assert len(prefixes) == len(
         suffixes
     ), "prefixes and suffixes must have the same length"
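The rng threading added above follows numpy's Generator pattern: each function accepts an optional generator and falls back to a fresh one, so callers can opt into reproducible query sampling. A minimal standalone sketch of the pattern (not code from this commit):

```python
import numpy as np

# The pattern used throughout this commit: accept an optional Generator
# and fall back to a fresh one, so callers can opt into reproducibility.
def sample_indices(n_rows, num_queries, rng=None):
    if rng is None:
        rng = np.random.default_rng()
    return rng.choice(n_rows, num_queries, replace=False)

# Two calls with the same seed draw the same query rows.
a = sample_indices(100, 5, rng=np.random.default_rng(42))
b = sample_indices(100, 5, rng=np.random.default_rng(42))
assert (a == b).all()
```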

tabmemcheck/functions.py

Lines changed: 41 additions & 52 deletions
@@ -14,7 +14,6 @@
 from tabmemcheck.llm import (
     LLM_Interface,
     ChatWrappedLLM,
-    send_chat_completion,
     send_completion,
     bcolors,
 )
@@ -71,6 +70,15 @@ def __llm_setup(llm: Union[LLM_Interface, str]):
     return llm
 
 
+def __print_file_name(csv_file):
+    print(
+        bcolors.BOLD
+        + "File: "
+        + bcolors.ENDC
+        + f"{os.path.basename(csv_file)}"
+    )
+
+
 def __print_info(csv_file, llm, few_shot_csv_files):
     """Print some information about the csv file and the model."""
     print(
@@ -155,6 +163,8 @@ def feature_names_test(
     num_prefix_features: int = None,
     few_shot_csv_files=DEFAULT_FEW_SHOT_CSV_FILES,
     system_prompt: str = "default",
+    verbose: bool = True,
+    return_result=False,
 ):
     """Test if the model knows the names of the features in a csv file.
 
@@ -246,20 +256,13 @@ def feature_names_test(
         if idx != -1:
             response = response[:idx]
 
-    print(
-        bcolors.BOLD
-        + "Dataset: "
-        + bcolors.ENDC
-        + os.path.basename(csv_file)
-        + bcolors.BOLD
-        + "\nFeature Names: "
-        + bcolors.ENDC
-        + ", ".join(feature_names)
-        + bcolors.BOLD
-        + "\nFeature Names Test: "
-        + bcolors.ENDC
-        + utils.levenshtein_cmd(", ".join(feature_names[num_prefix_features:]), response)
-    )
+    # prompt, continuation, response
+    test_triplet = ", ".join(feature_names[:num_prefix_features]) + ", ", ", ".join(feature_names[num_prefix_features:]), response
+    if verbose:
+        utils.display_test_result(*test_triplet, "Feature Names Test", csv_file)
+
+    if return_result:
+        return test_triplet
 
 
 ####################################################################################
@@ -297,12 +300,11 @@ def feature_values_test(
     pd.set_option('display.max_columns', None)  # Show all columns
     pd.set_option('display.width', 1000)  # Set the width to avoid wrapping
 
+    __print_file_name(csv_file)
     print(
         bcolors.BOLD
         + "Feature Values Test"
-        + "\nDataset: "
         + bcolors.ENDC
-        + os.path.basename(csv_file)
     )
     print_df = pd.concat([pd.DataFrame(sample_row).T.head(1), pd.DataFrame(row).head(1)])
     print_df.reset_index(drop=True, inplace=True)
@@ -311,7 +313,7 @@ def feature_values_test(
 
 
 ####################################################################################
-# Dataset Name (from the first rows of the csv file)
+# Dataset Name
 ####################################################################################
 
 
@@ -377,13 +379,10 @@ def dataset_name_test(
     else:
         raise NotImplementedError  # TODO
 
+    __print_file_name(csv_file)
     print(
         bcolors.BOLD
-        + "Dataset: "
-        + bcolors.ENDC
-        + os.path.basename(csv_file)
-        + bcolors.BOLD
-        + "\nGenerated Dataset Name: "
+        + "Generated Dataset Name: "
         + bcolors.ENDC
         + response
     )
@@ -402,6 +401,8 @@ def header_test(
     few_shot_csv_files: list[str] = DEFAULT_FEW_SHOT_CSV_FILES,
     system_prompt: str = "default",
     verbose: bool = True,
+    return_result=False,
+    rng=None,
 ):
     """Header test for memorization.
 
@@ -423,6 +424,10 @@ def header_test(
     if system_prompt == "default":
         system_prompt = tabmem.config.system_prompts["header"]
 
+    # rng
+    if rng is None:
+        rng = np.random.default_rng()
+
     # load the csv file as a single contiguous string. also load the rows to determine offsets within the string
     data = utils.load_csv_string(csv_file, header=True)
     csv_rows = utils.load_csv_rows(csv_file, header=True)
@@ -438,7 +443,7 @@ def header_test(
     header_prompt, llm_completion = None, None
     for i_row in split_rows:
         offset = np.sum([len(row) for row in csv_rows[: i_row - 1]])
-        offset += np.random.randint(
+        offset += rng.integers(
            len(csv_rows[i_row]) // 3, 2 * len(csv_rows[i_row]) // 3
        )
        prefixes = [data[:offset]]
@@ -451,7 +456,7 @@ def header_test(
         # chat mode: use few-shot examples
         if llm.chat_mode:
             _, _, response = prefix_suffix_chat_completion(
-                llm, prefixes, suffixes, system_prompt, few_shot=few_shot, num_queries=1
+                llm, prefixes, suffixes, system_prompt, few_shot=few_shot, num_queries=1, rng=rng
             )
             response = response[0]
         else:  # otherwise, plain completion
@@ -472,34 +477,12 @@ def header_test(
         llm_completion = response
     header_completion = data[offset : offset + len(llm_completion)]
 
+    test_triplet = header_prompt, header_completion, llm_completion
     if verbose:  # print test result to console
-        print(
-            bcolors.BOLD
-            + "Dataset: "
-            + bcolors.ENDC
-            + os.path.basename(csv_file)
-            + bcolors.BOLD
-            + "\nHeader Test: "
-            + bcolors.ENDC
-            + bcolors.Black
-            + header_prompt
-            + utils.levenshtein_cmd(header_completion, llm_completion)
-            + bcolors.ENDC
-            + bcolors.BOLD
-            + "\nHeader Test Legend: "
-            + bcolors.ENDC
-            + "Prompt "
-            + bcolors.Green
-            + "Correct "
-            + bcolors.Red
-            + "Incorrect "
-            + bcolors.ENDC
-            + bcolors.Purple
-            + "Missing"
-            + bcolors.ENDC
-        )
+        utils.display_test_result(*test_triplet, "Header Test", csv_file)
 
-    return header_prompt, header_completion, llm_completion
+    if return_result:
+        return test_triplet
 
 
 ####################################################################################
@@ -516,6 +499,7 @@ def row_completion_test(
     out_file=None,
     system_prompt: str = "default",
     print_levenshtein: bool = True,
+    rng=None,
 ):
     """Row completion test for memorization. The test reports the number of correctly completed rows.
 
@@ -571,10 +555,11 @@ def row_completion_test(
             few_shot,
             out_file,
             print_levenshtein,
+            rng=rng,
         )
     else:
         _, test_suffixes, responses = row_completion(
-            llm, csv_file, num_prefix_rows, num_queries, out_file, print_levenshtein=print_levenshtein
+            llm, csv_file, num_prefix_rows, num_queries, out_file, print_levenshtein=print_levenshtein, rng=rng
         )
 
     # count the number of verbatim completed rows
@@ -617,6 +602,7 @@ def feature_completion_test(
     few_shot=5,
     out_file=None,
     system_prompt: str = "default",
+    rng=None,
 ):
     """Feature completion test for memorization. The test reports the number of correctly completed features.
 
@@ -674,6 +660,7 @@ def build_prompt(messages):
         cond_feature_names,
         add_description=False,
         out_file=out_file,
+        rng=rng,
     )
 
     # parse the model responses
@@ -715,6 +702,7 @@ def first_token_test(
     few_shot=7,
     out_file=None,
     system_prompt: str = "default",
+    rng=None,
 ):
     """First token test for memorization. We ask the model to complete the first token of the next row of the csv file, given the previous rows. The test reports the number of correctly completed tokens.
 
@@ -781,6 +769,7 @@ def first_token_test(
             num_queries,
             few_shot,
             out_file,
+            rng=rng,
         )
     else:
         _, test_suffixes, responses = row_completion(
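Taken together, the new ```rng``` and ```return_result``` parameters allow reproducible, programmatic test runs. A hedged sketch, assuming ```header_test``` is importable at the package level and accepts a model-name string as the README's examples suggest; the file and model names are placeholders:

```python
import numpy as np
import tabmemcheck

# a seeded generator makes the header split point and the few-shot selection
# reproducible; return_result=True returns the triplet instead of only printing it
header_prompt, header_completion, llm_completion = tabmemcheck.header_test(
    "adult-test.csv",   # placeholder csv file, as in the README examples
    "gpt-3.5-turbo",    # placeholder model name
    rng=np.random.default_rng(0),
    return_result=True,
)
```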
