
Commit 158327f

Indraneil Paul authored and committed
Allow faster prototyping and add Cuda 11 dockerfile
1 parent 85937d9 commit 158327f

7 files changed: +267 -41 lines changed
File renamed without changes.

Generate.Dockerfile

Lines changed: 144 additions & 0 deletions
@@ -0,0 +1,144 @@
FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04

SHELL ["/bin/bash", "-c"]

# Setup Environment Variables
ENV CUDA_HOME=/usr/local/cuda \
    PYTHONUNBUFFERED=1 \
    TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"

# Setup System Utilities
RUN apt-get update --yes --quiet \
    && apt-get upgrade --yes --quiet \
    && DEBIAN_FRONTEND=noninteractive apt-get install --yes --quiet --no-install-recommends \
        apt-utils \
        autoconf \
        automake \
        bc \
        build-essential \
        ca-certificates \
        check \
        cmake \
        curl \
        dmidecode \
        emacs \
        g++ \
        gcc \
        git \
        iproute2 \
        jq \
        kmod \
        libaio-dev \
        libcurl4-openssl-dev \
        libgl1-mesa-glx \
        libglib2.0-0 \
        libgomp1 \
        libibverbs-dev \
        libnuma-dev \
        libnuma1 \
        libomp-dev \
        libsm6 \
        libssl-dev \
        libsubunit-dev \
        libsubunit0 \
        libtool \
        libxext6 \
        libxrender-dev \
        make \
        moreutils \
        net-tools \
        ninja-build \
        openssh-client \
        openssh-server \
        openssl \
        pkg-config \
        python3-dev \
        software-properties-common \
        sudo \
        unzip \
        util-linux \
        vim \
        wget \
        zlib1g-dev \
    && apt-get autoremove \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/

# Setup base Python to bootstrap Mamba
RUN add-apt-repository --yes ppa:deadsnakes/ppa \
    && apt-get update --yes --quiet
RUN DEBIAN_FRONTEND=noninteractive apt-get install --yes --quiet --no-install-recommends \
    python3.11 \
    python3.11-dev \
    python3.11-distutils \
    python3.11-lib2to3 \
    python3.11-gdbm \
    python3.11-tk \
    pip
RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 999 \
    && update-alternatives --config python3 \
    && ln -s /usr/bin/python3 /usr/bin/python
RUN pip install --upgrade pip

# Setup optimized Mamba environment with required PyTorch dependencies
RUN wget -O /tmp/Miniforge.sh https://github.com/conda-forge/miniforge/releases/download/24.3.0-0/Mambaforge-24.3.0-0-Linux-x86_64.sh \
    && bash /tmp/Miniforge.sh -b -p /Miniforge \
    && source /Miniforge/etc/profile.d/conda.sh \
    && source /Miniforge/etc/profile.d/mamba.sh \
    && mamba update -y -q -n base -c defaults mamba \
    && mamba create -y -q -n Code-Eval python=3.11 setuptools=69.5.1 \
    && mamba activate Code-Eval \
    && mamba install -y -q -c conda-forge \
        charset-normalizer \
        gputil \
        ipython \
        numpy \
        pandas \
        scikit-learn \
        wandb \
    && mamba install -y -q -c intel \
        "mkl==2023" \
        "mkl-static==2023" \
        "mkl-include==2023" \
    && mamba install -y -q -c pytorch magma-cuda118 \
    && mamba clean -a -f -y

# Install VLLM precompiled with appropriate CUDA and ensure PyTorch is installed from the same version channel
RUN source /Miniforge/etc/profile.d/conda.sh \
    && source /Miniforge/etc/profile.d/mamba.sh \
    && mamba activate Code-Eval \
    && pip install https://github.com/vllm-project/vllm/releases/download/v0.4.0/vllm-0.4.0+cu118-cp311-cp311-manylinux1_x86_64.whl \
        --extra-index-url https://download.pytorch.org/whl/cu118

# Install Flash Attention
RUN source /Miniforge/etc/profile.d/conda.sh \
    && source /Miniforge/etc/profile.d/mamba.sh \
    && mamba activate Code-Eval \
    && export MAX_JOBS=$(($(nproc) - 2)) \
    && pip install --no-cache-dir ninja packaging psutil \
    && pip install flash-attn==2.5.8 --no-build-isolation

# Add a new user "wildcodeuser"
RUN adduser --disabled-password --gecos "" wildcodeuser

# Acquire benchmark code to local
RUN git clone https://github.com/NVIDIA/apex /wildcode

RUN chown -R wildcodeuser:wildcodeuser /wildcode
USER wildcodeuser

# Install Code-Eval and pre-load the dataset
RUN source /Miniforge/etc/profile.d/conda.sh \
    && source /Miniforge/etc/profile.d/mamba.sh \
    && mamba activate Code-Eval \
    && pip install wild-code --upgrade \
    && python -c "from wildcode.data import get_wildcodebench; get_wildcodebench()"

WORKDIR /wildcode

# Declare an argument for the huggingface token
ARG HF_TOKEN
RUN if [[ -n "$HF_TOKEN" ]] ; then /Miniforge/envs/Code-Eval/bin/huggingface-cli login --token $HF_TOKEN ; \
    else echo "No HuggingFace token specified. Access to gated or private models will be unavailable." ; fi

ENTRYPOINT ["/Miniforge/envs/Code-Eval/bin/python", "-m", "wildcode.generate"]
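
A minimal sketch of how the new image might be built and used for prototyping. The image tag (wildcode-generate) and the model name are illustrative and not part of this commit; the HF_TOKEN build argument and the CLI flags come from this Dockerfile and from wildcode/generate.py further below.

# Build the CUDA 11 generation image; HF_TOKEN is optional and only needed for gated models.
docker build -f Generate.Dockerfile --build-arg HF_TOKEN=$HF_TOKEN -t wildcode-generate .

# The ENTRYPOINT already runs `python -m wildcode.generate`, so only its arguments are passed.
docker run --rm --gpus all wildcode-generate \
    --model deepseek-ai/deepseek-coder-6.7b-instruct \
    --dataset wildcodebench \
    --backend vllm

With this sketch the generated samples stay inside the container under /wildcode; mounting a volume and pointing --save_path at it would be one way to keep them on the host.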

Requirements/requirements-eval.txt

Lines changed: 74 additions & 0 deletions
@@ -0,0 +1,74 @@
beautifulsoup4==4.8.2
blake3==0.4.1
chardet==5.2.0
cryptography==38.0.0
datetime==5.5
Django==4.2.7
dnspython==2.6.1
docxtpl==0.11.5
Faker==20.1.0
flask_login==0.6.3
flask_restful==0.3.10
flask_wtf==1.2.1
Flask-Mail==0.9.1
flask==3.0.3
folium==0.16.0
gensim==4.3.2
geopandas==0.13.2
geopy==2.4.1
holidays==0.29
keras==2.11.0
Levenshtein==0.25.0
librosa==0.10.1
lxml==4.9.3
matplotlib==3.7.0
mechanize==0.4.9
natsort==7.1.1
networkx==2.6.3
nltk==3.8
numba==0.55.0
numpy==1.21.2
opencv-python-headless==4.9.0.80
openpyxl==3.1.2
pandas==2.0.3
Pillow==10.3.0
prettytable==3.10.0
psutil==5.9.5
pycryptodome==3.14.1
pyfakefs==5.4.1
pyquery==1.4.3
pytesseract==0.3.10
pytest==8.2.0
python_http_client==3.3.7
python-dateutil==2.9.0
python-docx==1.1.0
python-Levenshtein-wheels
pytz==2023.3.post1
PyYAML==6.0.1
requests_mock==1.11.0
requests==2.31.0
Requests==2.31.0
rsa==4.9
scikit-image==0.18.0
scikit-learn==1.3.1
scipy==1.7.2
seaborn==0.13.2
selenium==4.15.
sendgrid==6.11.0
shapely==2.0.4
soundfile==0.12.1
statsmodels==0.14.0
statsmodels==0.14.0
sympy==1.12
tensorflow==2.11.1
textblob==0.18.0
texttable==1.7.0
Werkzeug==3.0.1
wikipedia==1.4.0
wordcloud==1.9.3
wordninja==2.0.0
WTForms==3.1.2
xlrd==2.0.1
xlrd==2.0.1
xlwt==1.3.0
xmltodict==0.13.0

Requirements/requirements.txt

Lines changed: 18 additions & 0 deletions
@@ -0,0 +1,18 @@
accelerate>=0.30.1
anthropic>=0.26.1
appdirs>=1.4.4
fire>=0.6.0
google-generativeai>=0.5.4
mistralai>=0.2.0
multipledispatch>=0.6.0
numpy>=1.19.5
openai>=1.11.1
Pympler>=1.0.1
rich>=12.3.0
stop-sequencer>=1.2.3
tempdir>=0.7.1
termcolor>=2.0.0
tqdm>=4.56.0
tree_sitter_languages>=1.10.2
tree-sitter==0.21.3
wget>=3.2

requirements.txt

Lines changed: 0 additions & 18 deletions
This file was deleted.

setup.cfg

Lines changed: 13 additions & 14 deletions
@@ -17,26 +17,25 @@ packages = find:
 python_requires = >=3.8
 dependency_links =
 install_requires =
-    wget>=3.2
-    tempdir>=0.7.1
-    multipledispatch>=0.6.0
+    accelerate>=0.30.1
+    anthropic>=0.26.1
     appdirs>=1.4.4
-    numpy>=1.19.5
-    tqdm>=4.56.0
-    termcolor>=2.0.0
     fire>=0.6.0
+    google-generativeai>=0.5.4
+    mistralai>=0.2.0
+    multipledispatch>=0.6.0
+    numpy>=1.19.5
     openai>=1.11.1
+    Pympler>=1.0.1
     rich>=12.3.0
+    stop-sequencer>=1.2.3
+    tempdir>=0.7.1
+    termcolor>=2.0.0
+    tqdm>=4.56.0
     tree_sitter_languages>=1.10.2
     tree-sitter==0.21.3
-    Pympler>=1.0.1
-    accelerate
-    vllm
-    anthropic
-    mistralai
-    stop-sequencer
-    google-generativeai
-
+    wget>=3.2
+
 [options.entry_points]
 console_scripts =
     wildcode.evaluate = wildcode.evaluate:main
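
With the dependency pins now consolidated in install_requires, a plain editable install is enough for local prototyping. A minimal sketch, assuming a standard setuptools layout in the repository root and that wildcode.evaluate:main exposes an argparse CLI:

# Editable install from the repository root (sketch).
pip install -e .
# The console script declared under [options.entry_points] then becomes available:
wildcode.evaluate --help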

wildcode/generate.py

Lines changed: 18 additions & 9 deletions
@@ -1,8 +1,6 @@
 import os
 import json
 import argparse
-from os import PathLike
-from typing import List

 from wildcode.model import DecoderBase, make_model
 from rich.progress import (
@@ -24,6 +22,7 @@ def codegen(
     n_samples=1,
     id_range=None,
     resume=True,
+    subsample_size=None,
 ):
     with Progress(
         TextColumn(f"{dataset} •" + "[progress.percentage]{task.percentage:>3.0f}%"),
@@ -36,10 +35,13 @@
         from wildcode.data import get_wildcodebench, write_jsonl

         dataset = get_wildcodebench()
+        if subsample_size:
+            if subsample_size < len(dataset):
+                dataset = dataset[:subsample_size]

         if model.is_direct_completion() and nl2code:
             raise Exception("Base model does not support direct completion for NL2Code tasks")
-
+
         # create save_path if it doesn't exist, e.g., a/b.jsonl
         dirname = os.path.dirname(save_path)
         if not os.path.exists(dirname) and dirname != "":
@@ -53,7 +55,7 @@
                 continue

             p_name = task_id.replace("/", "_")
-
+
             # read the existing file if save_path exists
             if os.path.exists(save_path):
                 with open(save_path, "r") as f:
@@ -103,12 +105,14 @@
             print(f"Generated {len(samples)} samples")
             write_jsonl(save_path, samples, append=True)
             sidx += len(outputs)
-
+

 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument("--model", required=True, type=str)
     parser.add_argument("--dataset", required=True, type=str)
+    parser.add_argument("--save_path", default=None, type=str)
+    parser.add_argument("--subsample_size", default=None, type=int)
     parser.add_argument("--nl2code", action='store_true')
     parser.add_argument("--bs", default=1, type=int)
     parser.add_argument("--n_samples", default=1, type=int)
@@ -121,8 +125,8 @@ def main():
     parser.add_argument("--base_url", default=None, type=str)
     parser.add_argument("--tp", default=1, type=int)
     args = parser.parse_args()
-
-
+
+
     assert args.dataset in ["wildcodebench"], f"Invalid dataset {args.dataset}"
     assert args.backend in ["vllm", "hf", "openai", "mistral", "anthropic", "google"]

@@ -153,8 +157,12 @@ def main():
         task = "nl2c"
     else:
         task = "c2c"
-    save_path = args.model.replace("/", "--") + f"--{args.dataset}-{task}--{args.backend}-{args.temperature}-{args.n_samples}.jsonl"
-
+
+    if not args.save_path:
+        save_path = args.model.replace("/", "--") + f"--{args.dataset}-{task}--{args.backend}-{args.temperature}-{args.n_samples}.jsonl"
+    else:
+        save_path = args.save_path
+
     codegen(
         model=model_runner,
         save_path=save_path,
@@ -165,6 +173,7 @@ def main():
         n_samples=args.n_samples,
         resume=args.resume,
         id_range=args.id_range,
+        subsample_size=args.subsample_size,
     )

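The two new arguments support the faster-prototyping goal in the commit title: --subsample_size truncates the loaded benchmark to its first N tasks, and --save_path overrides the otherwise auto-generated output filename. A minimal sketch of a quick smoke run outside Docker; the model name and output path are illustrative, and without --save_path the filename is derived from the model, dataset, task, backend, temperature, and sample count as shown in the diff above.

python -m wildcode.generate \
    --model deepseek-ai/deepseek-coder-6.7b-instruct \
    --dataset wildcodebench \
    --backend vllm \
    --nl2code \
    --subsample_size 50 \
    --save_path results/smoke-test.jsonl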