
Commit a5b8fed

patrickhyw (Patrick Wang) and Ubuntu authored
Small QOL changes (#7)
* fixed transformers version
* installing small dataset directly
* now loading dataset properly
* fixed to make datagen work on gpu
* readme typo
* rem print in get datastore data
* added fschat back to req
* Revert "added fschat back to req"
  This reverts commit b8366db.
* add fs chat to req without removing data/
* main.rs
* modified some prints
* rm warnings
* wrote code to compute total three/four grams and write to a file
* moved stuff to exp
* removed extra stuff
* reverted get_datastore_chat.py
* gitignore
* lib.rs comments

---------

Co-authored-by: Ubuntu <[email protected]>
Co-authored-by: Patrick Wang <[email protected]>
Co-authored-by: Ubuntu <[email protected]>
1 parent 5b119c1 commit a5b8fed

File tree: 6 files changed (+22 lines, -9 lines)


.gitignore

Lines changed: 3 additions & 0 deletions
@@ -1,3 +1,6 @@
+# Generated files
+*.idx
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]

DraftRetriever/src/lib.rs

Lines changed: 11 additions & 3 deletions
@@ -1,9 +1,7 @@
 // The code for retrival is adapted from https://github.com/Intsights/PySubstringSearch;
 // The code for drafft buffer is adapted from https://github.com/FasterDecoding/Medusa/blob/main/medusa/model/utils.py#L31-L124
 use ahash::AHashSet;
-use bstr::io::BufReadExt;
 use byteorder::{ReadBytesExt, WriteBytesExt, ByteOrder, LittleEndian};
-use memchr::memmem;
 use parking_lot::Mutex;
 use pyo3::exceptions;
 use pyo3::prelude::*;
@@ -21,7 +19,6 @@ use pyo3::types::PyList;
 use std::collections::BinaryHeap;
 use std::fs;
 use std::io::Cursor;
-use std::fs::OpenOptions;
 
 extern "C" {
     pub fn libsais_int(
@@ -217,6 +214,7 @@ impl Reader {
         long: Option<i32>,
     ) -> PyResult<(Vec<Vec<i32>>, Vec<Vec<i32>>, Vec<i32>, Vec<i32>, Vec<Vec<i32>>)> {
 
+        // substring_i32 is just a rust version of py_substring
        let mut substring_i32 = Vec::new();
         for item in py_substring.iter() {
             let num: i32 = item.extract()?;
@@ -225,19 +223,28 @@ impl Reader {
 
         let results = Arc::new(Mutex::new(Vec::new()));
 
+        // each sub index is a buffer/suffix pair
         self.sub_indexes.par_iter_mut().for_each(
             |sub_index| {
                 let mut start_of_indices = None;
                 let mut end_of_indices = None;
 
+                // since suffix arrays have the suffixes in sorted order, we do a binary search
+                // over the suffix array
+                // this binary search finds the start of the matching suffixes
                 let mut left_anchor = sub_index.suffixes_file_start;
                 let mut right_anchor = sub_index.suffixes_file_end - 4;
                 while left_anchor <= right_anchor {
                     let middle_anchor = left_anchor + ((right_anchor - left_anchor) / 4 / 2 * 4);
                     sub_index.index_file.seek(SeekFrom::Start(middle_anchor as u64)).unwrap();
+                    // data_index is the value at middle_anchor in the suffix array
                     let data_index = sub_index.index_file.read_i32::<LittleEndian>().unwrap();
+                    // line is the actual suffix
                     let line = &sub_index.data[(data_index) as usize..];
 
+                    // we don't use the entire suffix. we look for suffixes that start with the substring we're looking for
+                    // the suffix array sorts suffixes based on the start of the suffix, so this technique is sound
+                    // the "match length" is defined by the length of substring_i32. the suffix array doesn't need to worry about "match length"
                     if line.starts_with(&substring_i32) {
                         start_of_indices = Some(middle_anchor);
                         right_anchor = middle_anchor - 4;
@@ -253,6 +260,7 @@ impl Reader {
                     return;
                 }
 
+                // this binary search finds the end of the matching suffixes
                 let mut right_anchor = sub_index.suffixes_file_end - 4;
                 while left_anchor <= right_anchor {
                     let middle_anchor = left_anchor + ((right_anchor - left_anchor) / 4 / 2 * 4);
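The comments added above describe the core retrieval trick: because a suffix array lists suffixes in sorted order, the suffixes that begin with the query token sequence form one contiguous block, which two binary searches can bound (one for the first match, one for the last). The sketch below is not the crate's code; it is a minimal, self-contained Rust illustration of that technique over an in-memory token array, whereas the real `lib.rs` binary-searches over 4-byte anchors into an on-disk index file. The names `build_suffix_array` and `find_matching_range` are illustrative only.

```rust
/// Build a naive suffix array over `data`: suffix start offsets, sorted lexicographically.
fn build_suffix_array(data: &[i32]) -> Vec<usize> {
    let mut sa: Vec<usize> = (0..data.len()).collect();
    sa.sort_by(|&a, &b| data[a..].cmp(&data[b..]));
    sa
}

/// Return the half-open range [lo, hi) of positions in `sa` whose suffixes
/// start with `pattern`.
fn find_matching_range(data: &[i32], sa: &[usize], pattern: &[i32]) -> (usize, usize) {
    // First binary search: leftmost suffix >= pattern (start of the matching block).
    let lo = sa.partition_point(|&s| &data[s..] < pattern);
    // Second binary search: first suffix past lo that no longer starts with pattern.
    let hi = lo + sa[lo..].partition_point(|&s| data[s..].starts_with(pattern));
    (lo, hi)
}

fn main() {
    // Token ids standing in for the datastore contents.
    let data = vec![3, 1, 2, 3, 1, 2, 1, 2, 3];
    let sa = build_suffix_array(&data);
    let pattern = vec![1, 2, 3];
    let (lo, hi) = find_matching_range(&data, &sa, &pattern);
    // Each hit is the starting offset of a suffix that begins with the pattern.
    for &start in &sa[lo..hi] {
        println!("match at offset {start}: {:?}", &data[start..]);
    }
}
```

The on-disk version in the diff appears to do the same thing with explicit `while left_anchor <= right_anchor` loops, reading one `i32` suffix-array entry per probe instead of indexing a `Vec`.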

README.md

Lines changed: 1 addition & 1 deletion
@@ -93,7 +93,7 @@ python3 get_datastore_code.py --model-path codellama/CodeLlama-7b-instruct-hf --
 ### Inference on MT-Bench
 ```bash
 cd llm_judge
-RAYON_NUM_THREADS=6 CUDA_VISIBLE_DEVICES=0 python3 get_model_answer_rest.py --model-path lmsys/vicuna-7b-v1.5 --model-id vicuna-7b-v1.5 --datastore-path ../datastore/datastore_chat_small.idx
+RAYON_NUM_THREADS=6 CUDA_VISIBLE_DEVICES=0 python3 gen_model_answer_rest.py --model-path lmsys/vicuna-7b-v1.5 --model-id vicuna-7b-v1.5 --datastore-path ../datastore/datastore_chat_small.idx
 ```
 
 ### Inference on HumanEval

datastore/get_datastore_chat.py

Lines changed: 0 additions & 1 deletion
@@ -52,4 +52,3 @@
         writer.add_entry(token_list)
 
 writer.finalize()
-
llm_judge/gen_model_answer_rest.py

Lines changed: 1 addition & 1 deletion
@@ -269,7 +269,7 @@ def get_model_answers(
                     if conv.name == "xgen" and output.startswith("Assistant:"):
                         output = output.replace("Assistant:", "", 1).strip()
                 except RuntimeError as e:
-                    print("ERROR question ID: ", question["question_id"])
+                    print(f"question ID {question['question_id']} errored out with {e}")
                     output = "ERROR"
 
                 turns.append(output)

requirements.txt

Lines changed: 6 additions & 3 deletions
@@ -1,10 +1,13 @@
 torch
-"fschat[model_worker,webui]"
+fschat[model_worker,webui]
 maturin==0.12
 numpy==1.26.1
 tqdm==4.66.1
-transformers==4.34.1
+transformers
 accelerate==0.24.1
 datasets
 openai
-anthropic
+anthropic
+sentencepiece
+protobuf
+shortuuid
