feat: added README & updated scripts

Darinochka · Darinochka · commit 1a2fe6ccd1a5 · 2024-11-16T15:14:03.000+03:00
diff --git a/scripts/experiments/README.md b/scripts/experiments/README.md
@@ -0,0 +1,73 @@
+## Overview
+`generate_experiments.sh` automates evaluating datasets against multiple scoring metrics in either a multilabel or a multiclass configuration. For each metric it iterates over the JSON datasets in the specified directory, updates the `autointent` configuration file via `update_metric.sh`, and then runs `autointent` on the dataset.
+
+---
+
+## Features
+- Processes datasets for **multilabel** or **multiclass** scenarios based on user input.
+- Supports multiple metrics:
+  - **Multilabel metrics**:
+    - `scoring_accuracy`
+    - `scoring_f1`
+    - `scoring_log_likelihood`
+    - `scoring_precision`
+    - `scoring_recall`
+    - `scoring_roc_auc`
+    - `scoring_neg_ranking_loss`
+    - `scoring_neg_coverage`
+    - `scoring_hit_rate`
+  - **Multiclass metrics**:
+    - `scoring_accuracy`
+    - `scoring_f1`
+    - `scoring_log_likelihood`
+    - `scoring_precision`
+    - `scoring_recall`
+    - `scoring_roc_auc`
+- Automatically handles configuration updates using `update_metric.sh`.
+- Prints progress for each dataset and metric, and stops with exit status 1 on the first error (failed datasets are not skipped).
+
+---
+
+## Requirements
+- **Dependencies**:
+  - `autointent` must be installed and available in the PATH.
+  - `yq` is required for processing YAML files. Ensure it is installed and available in the PATH.
+
+    ### Installing `yq`
+
+    #### Linux
+    ```
+    wget https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 -O /usr/bin/yq
+    chmod +x /usr/bin/yq
+    ```
+    #### macOS
+    ```
+    brew install yq
+    ```
+
+- **Input Files**:
+  - JSON files located in the directory specified by `<DATA_PATH>`.
+
+---
+
+## Usage
+Run from the repository root (the scripts use paths relative to it):
+```
+bash scripts/experiments/generate_experiments.sh <DATA_PATH> <LOG_PATH> <USE_MULTILABEL>
+```
+### Parameters
+
+- `<DATA_PATH>`: Path to the directory containing dataset JSON files.
+- `<LOG_PATH>`: Base directory for logs; each run writes to `<LOG_PATH>/<dataset>_<metric>`.
+- `<USE_MULTILABEL>`: `true` to use multilabel metrics, `false` to use multiclass metrics.
+
+## Example
+```
+bash scripts/experiments/generate_experiments.sh data/intent_records_regexp/ experiments/dnnc/ false
+```
+
+This command processes all JSON files in `data/intent_records_regexp/` using multiclass metrics, saving logs in `experiments/dnnc/`.
+
+## Notes
+
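+- `CONFIG_SCRIPT_PATH` in `generate_experiments.sh` is set to `scripts/experiments/update_metric.sh`, relative to the current directory. Adjust it if you run the script from anywhere other than the repository root.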
diff --git a/scripts/experiments/generate_experiments.sh b/scripts/experiments/generate_experiments.sh
@@ -1,46 +1,74 @@
 #!/bin/bash
 
-DATA_PATH="experiments/intent_description"
-LOG_PATH="experiments/intent_description/multilabel"
-METRIC="scoring_hit_rate"
-USE_MULTILABEL=true
-CONFIG_SCRIPT_PATH="./update_metric.sh"
-
-for FILE in "$DATA_PATH"/*.json; do
-  FILENAME=$(basename "$FILE" .json)
-  DATASET_NAME=$(echo "$FILENAME" | sed 's/_fix.*//')
-
-  # Determine the appropriate multilabel flag for the metric update script
-  if [ "$USE_MULTILABEL" = true ]; then
-    MULTILABEL_ARG="true"
-  else
-    MULTILABEL_ARG="false"
-  fi
-
-  # Update the metric in the configuration file
-  echo "Updating metric for dataset: $DATASET_NAME"
-  $CONFIG_SCRIPT_PATH "$METRIC" "$MULTILABEL_ARG"
-  if [ $? -ne 0 ]; then
-    echo "Error updating metric for $DATASET_NAME. Exiting."
-    exit 1
-  fi
-
-  rm -rf runs/
-
-  echo "Processing dataset: $DATASET_NAME"
-  autointent data.train_path="$FILE" \
-             logs.dirpath="$LOG_PATH/${DATASET_NAME}_${METRIC}" \
-             seed=42 \
-             vector_index.device=cuda \
-             hydra.job_logging.root.level=INFO \
-             data.force_multilabel="$USE_MULTILABEL"
-
-  if [ $? -ne 0 ]; then
-    echo "Error encountered while processing $FILE. Exiting."
-    exit 1
-  else
-    echo "Successfully processed $FILE"
-  fi
+# Check for the required arguments
+if [ "$#" -ne 3 ]; then
+  echo "Usage: $0 <DATA_PATH> <LOG_PATH> <USE_MULTILABEL>"
+  exit 1
+fi
+
+# Read arguments
+DATA_PATH="$1"
+LOG_PATH="$2"
+USE_MULTILABEL="$3"
+CONFIG_SCRIPT_PATH="scripts/experiments/update_metric.sh"
+
+# Define metrics for multilabel and multiclass
+if [ "$USE_MULTILABEL" = true ]; then
+  METRICS=(
+    "scoring_accuracy"
+    "scoring_f1"
+    "scoring_log_likelihood"
+    "scoring_precision"
+    "scoring_recall"
+    "scoring_roc_auc"
+    "scoring_neg_ranking_loss"
+    "scoring_neg_coverage"
+    "scoring_hit_rate"
+  )
+else
+  METRICS=(
+    "scoring_accuracy"
+    "scoring_f1"
+    "scoring_log_likelihood"
+    "scoring_precision"
+    "scoring_recall"
+    "scoring_roc_auc"
+  )
+fi
+
+# Iterate through each metric
+for METRIC in "${METRICS[@]}"; do
+  echo "Processing with metric: $METRIC"
+
+  for FILE in "$DATA_PATH"/*.json; do
+    FILENAME=$(basename "$FILE" .json)
+    DATASET_NAME=$(echo "$FILENAME" | sed 's/_fix.*//')
+
+    # Update the metric in the configuration file
+    echo "Updating metric for dataset: $DATASET_NAME"
+    $CONFIG_SCRIPT_PATH "$METRIC" "$USE_MULTILABEL"
+    if [ $? -ne 0 ]; then
+      echo "Error updating metric for $DATASET_NAME with metric: $METRIC. Exiting."
+      exit 1
+    fi
+
+    rm -rf runs/
+
+    echo "Processing dataset: $DATASET_NAME with metric: $METRIC"
+    autointent data.train_path="$FILE" \
+               logs.dirpath="$LOG_PATH/${DATASET_NAME}_${METRIC}" \
+               seed=42 \
+               vector_index.device=cuda \
+               hydra.job_logging.root.level=INFO \
+               data.force_multilabel="$USE_MULTILABEL"
+
+    if [ $? -ne 0 ]; then
+      echo "Error encountered while processing $FILE with metric: $METRIC. Exiting."
+      exit 1
+    else
+      echo "Successfully processed $FILE with metric: $METRIC"
+    fi
+  done
 done
 
-echo "All datasets processed successfully."
+echo "All datasets processed successfully for all metrics."
diff --git a/scripts/experiments/update_metric.sh b/scripts/experiments/update_metric.sh
@@ -11,9 +11,9 @@ MULTILABEL="$2"
 
 # Determine the correct configuration file based on the multilabel argument
 if [ "$MULTILABEL" == "true" ]; then
-  CONFIG_PATH="../../autointent/datafiles/default-multilabel-config.yaml"
+  CONFIG_PATH="autointent/datafiles/default-multilabel-config.yaml"
 elif [ "$MULTILABEL" == "false" ]; then
-  CONFIG_PATH="../../autointent/datafiles/default-multiclass-config.yaml"
+  CONFIG_PATH="autointent/datafiles/default-multiclass-config.yaml"
 else
   echo "Invalid value for <multilabel>. Use 'true' or 'false'."
   exit 1