Commit 46ca895

dataset name test, config.csv_max_rows and other fixes for very large csv files

1 parent e7a8011 commit 46ca895

File tree: 6 files changed, +1050 −39 lines

examples/MLE-bench-contamination.ipynb

Lines changed: 861 additions & 0 deletions
Large diffs are not rendered by default.

tabmemcheck/__init__.py

Lines changed: 6 additions & 1 deletion
@@ -15,6 +15,8 @@
     run_all_tests,
     header_test,
     feature_names_test,
+    feature_values_test,
+    dataset_name_test,
     row_completion_test,
     feature_completion_test,
     first_token_test,
@@ -59,9 +61,12 @@ def __delattr__(self, key):

 # default llm options
 config.temperature = 0
-config.max_tokens = 500
+config.max_tokens = 1000
 config.sleep = 0.0  # amount of time to sleep after each query to the llm

+# csv file loading options
+config.csv_max_rows = 100000  # maximum number of rows to load from a csv file
+
 # default: no prompt/response logging
 config.current_logging_task = None
 config.current_logging_folder = None
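
The new config.csv_max_rows option caps how many rows tabmemcheck reads from any csv file. A minimal sketch of how a user might adjust the new defaults from their own code (the values below are illustrative, not recommendations):

    import tabmemcheck

    # this commit raises the default completion budget from 500 to 1000 tokens
    tabmemcheck.config.max_tokens = 1000

    # by default only the first 100000 rows of a csv file are loaded;
    # a lower cap speeds up experiments on very large files
    tabmemcheck.config.csv_max_rows = 10000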

tabmemcheck/analysis.py

Lines changed: 1 addition & 3 deletions
@@ -63,9 +63,7 @@ def find_matches(
 ):
     """Find the closest matches between a row x and all rows in the dataframe df. By default, we use the levenshtein distance as the distance metric.

-    This function can handle a variety of formatting differences between the values in the original data
-    and LLM responses that should still be counted as equal.
-
+    This function can handle some formatting differences between the values in the original data and LLM responses that should still be counted as equal.

     :param df: a pandas dataframe.
     :param x: a string, a pandas dataframe or a pandas Series.
tabmemcheck/functions.py

Lines changed: 159 additions & 31 deletions
@@ -38,9 +38,9 @@
 ]


-def __difflib_similar(csv_file_1, csv_file_2):
+def __difflib_similar(csv_file_1, csv_file_2, max_length=5000):
     sm = SequenceMatcher(
-        None, utils.load_csv_string(csv_file_1), utils.load_csv_string(csv_file_2)
+        None, utils.load_csv_string(csv_file_1, size=max_length)[:max_length], utils.load_csv_string(csv_file_2, size=max_length)[:max_length]
     )
     if sm.quick_ratio() > 0.9:
         return sm.ratio() > 0.9
@@ -49,33 +49,19 @@ def __difflib_similar(csv_file_1, csv_file_2):

 def __validate_few_shot_files(csv_file, few_shot_csv_files):
     """check if the csv_file is contained in the few_shot_csv_files."""
-    dataset_name = utils.get_dataset_name(csv_file)
-    few_shot_names = [utils.get_dataset_name(x) for x in few_shot_csv_files]
-    if dataset_name in few_shot_names:
-        # replace the dataset with iris or adult
-        few_shot_csv_files = [
-            x for x in few_shot_csv_files if utils.get_dataset_name(x) != dataset_name
-        ]
-        if 'iris' in dataset_name:
-            few_shot_csv_files.append("adult-train.csv")
-        else:
-            few_shot_csv_files.append("iris.csv")
-        print(
-            bcolors.BOLD
-            + "Info: "
-            + bcolors.ENDC
-            + f"Exchanged a few-shot datasets because its name is similar to the dataset being tested."
-        )
-    # now test with difflib if the dataset contents are very similar
+    validated_few_shot_files = []
+    # test with difflib if the dataset contents are very similar
     for fs_file in few_shot_csv_files:
         if __difflib_similar(csv_file, fs_file):
             print(
                 bcolors.BOLD
-                + "Warning: "
+                + "Info: "
                 + bcolors.ENDC
-                + f"The dataset is very similar to the few-shot dataset {utils.get_dataset_name(fs_file)}."
+                + f"Removed the few-shot dataset {fs_file} because it is similar to the dataset being tested."
             )
-    return few_shot_csv_files
+        else:
+            validated_few_shot_files.append(fs_file)
+    return validated_few_shot_files


 def __llm_setup(llm: Union[LLM_Interface, str]):
@@ -193,9 +179,6 @@ def feature_names_test(
     if num_prefix_features is None:
         num_prefix_features = max(1, len(feature_names) // 4)

-    # remove the current csv file from the few-shot csv files should it be present there
-    few_shot_csv_files = [x for x in few_shot_csv_files if not dataset_name in x]
-
     # setup for the few-shot examples
     fs_dataset_names = [utils.get_dataset_name(x) for x in few_shot_csv_files]
     fs_feature_names = [
@@ -265,13 +248,17 @@ def feature_names_test(

     print(
         bcolors.BOLD
-        + "Feature Names Test\nFeature Names: "
+        + "Dataset: "
         + bcolors.ENDC
-        + ", ".join(feature_names[num_prefix_features:])
+        + os.path.basename(csv_file)
         + bcolors.BOLD
-        + "\nModel Generation: "
+        + "\nFeature Names: "
         + bcolors.ENDC
-        + response
+        + ", ".join(feature_names)
+        + bcolors.BOLD
+        + "\nFeature Names Test: "
+        + bcolors.ENDC
+        + utils.levenshtein_cmd(", ".join(feature_names[num_prefix_features:]), response)
     )


@@ -280,6 +267,128 @@ def feature_names_test(
 ####################################################################################


+def feature_values_test(
+    csv_file: str,
+    llm: Union[LLM_Interface, str],
+    few_shot_csv_files=DEFAULT_FEW_SHOT_CSV_FILES,
+    system_prompt: str = "default",
+):
+    """Test if the model knows valid feature values for the features in a csv file. Asks the model to provide samples, then compares the sampled feature values to the values in the csv file.
+
+    :param csv_file: The path to the csv file.
+    :param llm: The language model to be tested.
+    :param few_shot_csv_files: A list of other csv files to be used as few-shot examples.
+    :param system_prompt: The system prompt to be used.
+    """
+
+    # first, sample 3 observations at temperature zero
+    samples_df = sample(csv_file, llm, num_queries=3, temperature=0.0, few_shot_csv_files=few_shot_csv_files, system_prompt=system_prompt)
+
+    # check that there is at least one valid sample
+    if samples_df.empty:
+        print("Error: The LLM was not able to provide valid samples.")
+        return
+
+    # choose the first sample
+    sample_row = samples_df.iloc[0]
+    _, row = analysis.find_matches(utils.load_csv_df(csv_file), sample_row)
+
+    # set pandas display options for better formatting
+    pd.set_option('display.max_columns', None)  # show all columns
+    pd.set_option('display.width', 1000)  # set the width to avoid wrapping
+
+    print(
+        bcolors.BOLD
+        + "Feature Values Test"
+        + "\nDataset: "
+        + bcolors.ENDC
+        + os.path.basename(csv_file)
+    )
+    print_df = pd.concat([pd.DataFrame(sample_row).T.head(1), pd.DataFrame(row).head(1)])
+    print_df.reset_index(drop=True, inplace=True)
+    print_df.rename(index={0: bcolors.BOLD + "Model Sample" + bcolors.ENDC, 1: bcolors.BOLD + "Dataset Match" + bcolors.ENDC}, inplace=True)
+    print(print_df)
+
+
+####################################################################################
+# Dataset Name (from the first rows of the csv file)
+####################################################################################
+
+
+def dataset_name_test(
+    csv_file: str,
+    llm: Union[LLM_Interface, str],
+    few_shot_csv_files=DEFAULT_FEW_SHOT_CSV_FILES,
+    few_shot_dataset_names=None,
+    num_rows=5,
+    header=True,
+    system_prompt: str = "default",
+):
+    """Test if the model knows the name of the dataset, given the first rows of the csv file.
+
+    :param csv_file: The path to the csv file.
+    :param llm: The language model to be tested.
+    :param few_shot_csv_files: A list of other csv files to be used as few-shot examples.
+    :param few_shot_dataset_names: A list of dataset names to be used as few-shot examples. If None, the dataset names are the file names of the few-shot csv files.
+    :param num_rows: The number of dataset rows to be given to the model as part of the prompt.
+    :param header: If True, the first row of the csv file is included in the prompt (it usually contains the feature names).
+    :param system_prompt: The system prompt to be used.
+    """
+
+    llm = __llm_setup(llm)
+    few_shot_csv_files = __validate_few_shot_files(csv_file, few_shot_csv_files)
+
+    # default system prompt?
+    if system_prompt == "default":
+        system_prompt = tabmem.config.system_prompts["dataset-name"]
+
+    if few_shot_dataset_names is None:
+        few_shot_dataset_names = [utils.get_dataset_name(x) for x in few_shot_csv_files]
+
+    if llm.chat_mode:
+        # construct the prompt
+        prefixes = [
+            "\n".join(utils.load_csv_rows(csv_file, header=header)[:num_rows])
+        ]
+        suffixes = [utils.get_dataset_name(csv_file)]
+
+        few_shot = []
+        for fs_csv_file, dataset_name in zip(few_shot_csv_files, few_shot_dataset_names):
+            few_shot.append(
+                (
+                    [
+                        "\n".join(utils.load_csv_rows(fs_csv_file, header=header)[:num_rows])
+                    ],
+                    [dataset_name],
+                )
+            )
+
+        # execute the prompt
+        _, _, responses = prefix_suffix_chat_completion(
+            llm,
+            prefixes,
+            suffixes,
+            system_prompt,
+            few_shot=few_shot,
+            num_queries=1,
+        )
+        response = responses[0]
+    else:
+        raise NotImplementedError  # TODO
+
+    print(
+        bcolors.BOLD
+        + "Dataset: "
+        + bcolors.ENDC
+        + os.path.basename(csv_file)
+        + bcolors.BOLD
+        + "\nGenerated Dataset Name: "
+        + bcolors.ENDC
+        + response
+    )
+
+
 ####################################################################################
 # Header Test
 ####################################################################################
@@ -366,7 +475,11 @@ def header_test(
     if verbose:  # print test result to console
         print(
             bcolors.BOLD
-            + "Header Test: "
+            + "Dataset: "
+            + bcolors.ENDC
+            + os.path.basename(csv_file)
+            + bcolors.BOLD
+            + "\nHeader Test: "
             + bcolors.ENDC
             + bcolors.Black
             + header_prompt
@@ -422,6 +535,13 @@ def row_completion_test(
     if system_prompt == "default":  # default system prompt?
         system_prompt = tabmem.config.system_prompts["row-completion"]

+    print(
+        bcolors.BOLD
+        + "Dataset: "
+        + bcolors.ENDC
+        + os.path.basename(csv_file)
+    )
+
     # what fraction of the rows are duplicates?
     rows = utils.load_csv_rows(csv_file)
     frac_duplicates = 1 - len(set(rows)) / len(rows)
@@ -717,6 +837,7 @@ def sample(
     csv_file: str,
     llm: Union[LLM_Interface, str],
     num_queries: int,
+    temperature: float = 0.7,
     few_shot_csv_files: list[str] = DEFAULT_FEW_SHOT_CSV_FILES,
     cond_feature_names: list[str] = [],
     drop_invalid_responses: bool = True,
@@ -742,6 +863,10 @@ def sample(
     if not llm.chat_mode:  # wrap base model to take chat queries
         llm = ChatWrappedLLM(llm, build_sample_prompt, ends_with="\n\n")

+    # store the temperature
+    temp = tabmem.config.temperature
+    tabmem.config.temperature = temperature
+
     # run the test
     _, _, responses = feature_values_chat_completion(
         llm,
@@ -754,6 +879,9 @@ def sample(
         out_file=None,
     )

+    # reset the temperature
+    tabmem.config.temperature = temp
+
     if len(cond_feature_names) > 0:
         raise NotImplementedError("Conditional sampling not yet supported.")
         # TODO handle the conditional case!
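
A sketch of how the two new tests added above might be called; the csv path and model name are placeholders (per their signatures, both functions also accept an LLM_Interface object):

    import tabmemcheck

    csv_file = "adult-train.csv"  # placeholder path
    llm = "gpt-3.5-turbo"  # placeholder model name

    # ask the model to name the dataset, given its first rows
    tabmemcheck.dataset_name_test(csv_file, llm, num_rows=5)

    # sample at temperature zero and compare against the closest row in the csv file
    tabmemcheck.feature_values_test(csv_file, llm)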

tabmemcheck/resources/config/system-prompts.yaml

Lines changed: 3 additions & 0 deletions
@@ -21,5 +21,8 @@ generic-csv-format: |
 feature-names: |
   You are an expert assistant for tabular datasets. Your task is to list the names of the features of different datasets. The user provides a description of the dataset and some of the feature names. You then provide the names of the remaining features.

+dataset-name: |
+  You are an expert assistant for tabular datasets. Your task is to provide the name of the dataset. The user provides the initial rows of the csv file, including the feature names. You then provide the name of the dataset.
+
 predict: |
   You are an expert assistant for tabular datasets. You provide predictions on different datasets. The user provides the name of the dataset, the names of the features, as well the values of all the features except one. You then provide a prediction for the missing feature (the target).
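
For reference, the new dataset-name entry is the prompt that dataset_name_test (see functions.py above) resolves when called with system_prompt="default"; tabmem is the package alias used in functions.py:

    system_prompt = tabmem.config.system_prompts["dataset-name"]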

tabmemcheck/utils.py

Lines changed: 20 additions & 4 deletions
@@ -6,6 +6,7 @@
 import jellyfish
 import difflib
 import tempfile
+import itertools

 import csv

@@ -80,14 +81,23 @@ def get_feature_names(csv_file):
     return df.columns.tolist()


+CSV_MAX_ROWS_WARNING_PRINTED = False
 def load_csv_df(csv_file, header=True, delimiter="auto", **kwargs):
     """Load a csv file as a pandas data frame."""
+    global CSV_MAX_ROWS_WARNING_PRINTED
     with _csv_file(csv_file) as csv_file:
         # auto detect the delimiter from the csv file
         if delimiter == "auto":
             delimiter = get_delimiter(csv_file)
         # load the csv file
-        df = pd.read_csv(csv_file, delimiter=delimiter, **kwargs)
+        max_rows = tabmem.config.csv_max_rows
+        df = pd.read_csv(csv_file, delimiter=delimiter, nrows=max_rows + 1, **kwargs)
+        # check if the file has more rows than max_rows
+        if len(df) > max_rows and not CSV_MAX_ROWS_WARNING_PRINTED:
+            print(f'Info: Found a CSV file with more than {max_rows} rows. Note that tabmemcheck is configured to use only the first {max_rows} rows. Set tabmemcheck.config.csv_max_rows to change this behavior.')
+            CSV_MAX_ROWS_WARNING_PRINTED = True
+        # truncate the dataframe to the first max_rows rows
+        df = df.head(max_rows)
     # optionally, remove the header
     if not header:
         df = df.iloc[1:]
@@ -96,9 +106,15 @@ def load_csv_df(csv_file, header=True, delimiter="auto", **kwargs):


 def load_csv_rows(csv_file, header=True):
     """Load a csv file as a list of strings, with one string per row."""
+    global CSV_MAX_ROWS_WARNING_PRINTED
     with _csv_file(csv_file) as csv_file:
         with open(csv_file, "r") as f:
-            data = f.readlines()
+            data = list(itertools.islice(f, tabmem.config.csv_max_rows + 1))
+        # check if the file has more rows than csv_max_rows; if yes, print a warning and truncate
+        if len(data) > tabmem.config.csv_max_rows and not CSV_MAX_ROWS_WARNING_PRINTED:
+            print(f'Info: Found a CSV file with more than {tabmem.config.csv_max_rows} rows. Note that tabmemcheck is configured to use only the first {tabmem.config.csv_max_rows} rows. Set tabmemcheck.config.csv_max_rows to change this behavior.')
+            CSV_MAX_ROWS_WARNING_PRINTED = True
+        data = data[:tabmem.config.csv_max_rows]
         # remove all trailing newlines
         data = [line.rstrip("\n") for line in data]
         # remove all empty rows
@@ -109,12 +125,12 @@ def load_csv_rows(csv_file, header=True):
     return data


-def load_csv_string(csv_file, header=True):
+def load_csv_string(csv_file, header=True, size=10000000):
     """Load a csv file as a single string."""
     with _csv_file(csv_file) as csv_file:
         # load the csv file into a single string
         with open(csv_file, "r") as f:
-            data = f.read()
+            data = f.read(size)
         # remove header TODO, this currently only works if header does not contain "\n"
         if not header:
             data = data.split("\n")[1:]
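
Both loaders use the same read-one-extra trick: request csv_max_rows + 1 rows, and if the extra row actually materializes, the file exceeds the cap and is truncated with a one-time warning. A standalone sketch of the pattern (an illustrative helper, not part of tabmemcheck):

    import itertools

    def read_capped(path, max_rows):
        # read at most max_rows lines, plus one sentinel line
        with open(path, "r") as f:
            lines = list(itertools.islice(f, max_rows + 1))
        truncated = len(lines) > max_rows  # the sentinel line proves the file was larger
        return lines[:max_rows], truncated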
