Commit b5f64c0
Eval reproduce recipe using lm-evaluation-harness and our 3.1 evals datasets (meta-llama#627)
2 parents: eaded5e + e354eee
File tree: 20 files changed (+1154, -3 lines)
.github/scripts/spellcheck_conf/wordlist.txt (19 additions, 1 deletion)

```diff
@@ -1432,4 +1432,22 @@ CPUs
 modelUpgradeExample
 guardrailing
 MaaS
-MFU
+MFU
+BBH
+GPQA
+IFEVAL
+IFeval
+bos
+gpqa
+ifeval
+lighteval
+sqrt
+wis
+evals
+mmlu
+parsers
+reproducibility
+openhathi
+sarvam
+subtask
+acc
```

recipes/use_cases/multilingual/README.md (2 additions, 1 deletion)

```diff
@@ -1,7 +1,8 @@
 # Extending Llama to a new language
 Authored by : Sarvam team
 In this recipe, we will see how to add a new language to the Llama family of models. The steps are quite general and can be easily adapted to other models as well. Using this recipe, you should be able to replicate the findings of [OpenHathi](https://huggingface.co/sarvamai/OpenHathi-7B-Hi-v0.1-Base).
-Please read more about OpenHathi [here](https://analyticsindiamag.com/industry-insights/ai-startups/indian-startup-sarvam-ai-launches-hindi-llm-openhathi/)
+Please read more about OpenHathi [here](https://web.archive.org/web/20240418103408/https://www.sarvam.ai/blog/announcing-openhathi-series)
+
 ## Data
 The original OpenHathi model uses a combination of [Sangraha](https://huggingface.co/datasets/ai4bharat/sangraha) and Wikipedia as its primary data sources. If the reader is interested in using these sources, they would also have to preprocess the data: clean, filter, and deduplicate. See [Setu](https://github.com/AI4Bharat/setu) for an easy way to do this at scale.
```

requirements.txt (1 addition, 1 deletion)

```diff
@@ -28,4 +28,4 @@ langchain_openai
 langchain
 langchain_community
 sentence_transformers
-codeshield
+codeshield
```

tools/benchmarks/llm_eval_harness/README.md (4 additions, 0 deletions)

````diff
@@ -62,6 +62,10 @@ There has been an study from [IBM on efficient benchmarking of LLMs](https://arx
 python eval.py --model hf --model_args pretrained=meta-llama/Meta-Llama-3.1-8B,dtype="float",peft=../peft_output --tasks hellaswag --num_fewshot 10 --device cuda:0 --batch_size 8 --limit 100
 ```
 
+### Reproducing Meta 3.1 Evaluation Metrics Using LM-Evaluation-Harness
+
+The [meta_eval_reproduce](./meta_eval_reproduce/) folder provides a detailed guide on how to reproduce the Meta Llama 3.1 evaluation metrics reported on the [Meta Llama website](https://llama.meta.com/) using the [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness/tree/main) and our [3.1 evals Huggingface collection](https://huggingface.co/collections/meta-llama/llama-31-evals-66a2c5a14c2093e58298ac7f). By following the steps outlined, users can replicate an evaluation process similar to Meta's for specific tasks and compare their results with our reported metrics. While slight variations in results are expected due to differences in implementation and model behavior, we aim to provide a transparent and reproducible method for evaluating Meta Llama 3.1 models using a third-party library. Please check the [README.md](./meta_eval_reproduce/README.md) for more details.
+
 ### Reproducing Hugging Face Open-LLM-Leaderboard
 
 Here, we provided a list of tasks from `Open-LLM-Leaderboard` which can be used by passing `--open-llm-leaderboard-tasks` instead of `tasks` to the `eval.py`.
````

tools/benchmarks/llm_eval_harness/meta_eval_reproduce/README.md (213 additions, 0 deletions)

Large diffs are not rendered by default.
Eval configuration YAML (file name not shown in this view; 32 additions, 0 deletions)

```yaml
model_name: "meta-llama/Meta-Llama-3.1-8B-Instruct" # The name of the model to evaluate. This must be a valid Meta Llama 3 based model name in the HuggingFace model hub.

evals_dataset: "meta-llama/Meta-Llama-3.1-8B-Instruct-evals" # The name of the 3.1 evals dataset to evaluate; please make sure this eval dataset corresponds to the model loaded. This must be a valid dataset name in the Llama 3.1 Evals collection.
# Must be one of the following: ["meta-llama/Meta-Llama-3.1-8B-Instruct-evals", "meta-llama/Meta-Llama-3.1-70B-Instruct-evals", "meta-llama/Meta-Llama-3.1-405B-Instruct-evals", "meta-llama/Meta-Llama-3.1-8B-evals", "meta-llama/Meta-Llama-3.1-70B-evals", "meta-llama/Meta-Llama-3.1-405B-evals"]

tasks: "meta_instruct" # Available tasks for instruct models: "meta_math_hard", "meta_gpqa", "meta_mmlu_pro_instruct", "meta_ifeval"; or just use "meta_instruct" to run all of them.
# Available tasks for pretrained models: "meta_bbh", "meta_mmlu_pro_pretrain"; or just use "meta_pretrain" to run all of them.

tensor_parallel_size: 1 # The vLLM argument that specifies the tensor parallel size for the model, i.e. how many GPUs to use for one model copy.

data_parallel_size: 4 # The vLLM argument that specifies the data parallel size, i.e. how many copies of the model will be used.

gpu_memory_utilization: 0.9 # The vLLM argument that specifies GPU memory utilization; the rest will be reserved for the KV cache.

max_model_len: 8192 # The vLLM argument that specifies the model max length; decrease this value only if you encounter GPU memory issues. Please make sure max_gen_toks in the task yaml does not exceed this length.

batch_size: "auto" # Batch size, can be 'auto', 'auto:N', or an integer. It is strongly recommended to use 'auto' with vLLM to speed up inference.

output_path: "eval_results" # The output folder to store all the eval results and samples.

#limit: 12 # Limit the number of examples per task; set to 'null' to run all.
limit: null # Limit the number of examples per task; set to 'null' to run all.

verbosity: "INFO" # Logging level: CRITICAL, ERROR, WARNING, INFO, DEBUG.

log_samples: true # If true, write out all model outputs and documents for per-sample measurement and post-hoc analysis.

work_dir: ./work_dir # The work folder where the task template yaml files will be copied and modified, and where datasets will be downloaded for math_hard and ifeval.

template_dir: ./meta_template # Path to the folder that contains all the meta task templates.

show_config: false # If true, show the full config of all tasks at the end of the evaluation.
```
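
To make the wiring concrete, below is a minimal sketch of how a config like the one above could be loaded and handed to lm-evaluation-harness from Python. The filename `eval_config.yaml`, the use of `work_dir` as the task include path, and the exact `simple_evaluate`/`TaskManager` keyword arguments are assumptions based on recent lm-eval versions; the recipe's own driver script (described in its README) may differ.

```python
# Sketch only: assumes `pip install lm-eval vllm pyyaml` and a config file named
# eval_config.yaml (assumed name) with the keys shown above.
import yaml
import lm_eval
from lm_eval.tasks import TaskManager

with open("eval_config.yaml") as f:
    cfg = yaml.safe_load(f)

# Build the model_args string in the comma-separated key=value format that
# lm-eval forwards to its vLLM backend.
model_args = (
    f"pretrained={cfg['model_name']},"
    f"tensor_parallel_size={cfg['tensor_parallel_size']},"
    f"data_parallel_size={cfg['data_parallel_size']},"
    f"gpu_memory_utilization={cfg['gpu_memory_utilization']},"
    f"max_model_len={cfg['max_model_len']},"
    "dtype=auto"
)

# Point the task manager at the folder holding the meta_* task templates
# (the config's work_dir, where the templates are copied and modified).
task_manager = TaskManager(include_path=cfg["work_dir"])

results = lm_eval.simple_evaluate(
    model="vllm",
    model_args=model_args,
    tasks=[cfg["tasks"]],          # e.g. "meta_instruct" or "meta_gpqa"
    batch_size=cfg["batch_size"],  # "auto" is recommended for vLLM
    limit=cfg["limit"],
    log_samples=cfg["log_samples"],
    task_manager=task_manager,
)
print(results["results"])
```
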
meta_bbh task template YAML (file name not shown in this view; 28 additions, 0 deletions)

```yaml
dataset_path: meta-llama/Meta-Llama-3.1-8B-evals
dataset_name: Meta-Llama-3.1-8B-evals__bbh__details
task: meta_bbh
output_type: generate_until
process_docs: !function utils.process_docs
test_split: latest
doc_to_text: !function utils.doc_to_text
doc_to_target: answer
filter_list:
  - name: "strict-match"
    filter:
      - function: "regex"
        regex_pattern: 'the answer is (.*?)\.'
      - function: "take_first"
generation_kwargs:
  until: "\n\nQ: "
  do_sample: false
  temperature: 0
  max_gen_toks: 512
num_fewshot: 0
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
    ignore_case: true
    ignore_punctuation: true
metadata:
  version: 1.0
```
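
To illustrate what the `strict-match` filter above does, here is a small sketch that applies the same regex to an invented model completion: the regex captures the span after "the answer is", `take_first` keeps the first match, and `exact_match` then compares it to the target while ignoring case and punctuation. The completion text and the normalization helper are illustrative only, not the harness's actual implementation.

```python
import re
import string

# Invented completion; real completions come from the model under evaluation.
completion = "Let's think step by step. The list has 7 items, so the answer is 7. Q:"

# Equivalent of the "regex" + "take_first" filter chain in the task yaml.
matches = re.findall(r'the answer is (.*?)\.', completion)
prediction = matches[0] if matches else "[invalid]"  # rough stand-in for the no-match fallback

# Rough equivalent of exact_match with ignore_case / ignore_punctuation.
def normalize(text: str) -> str:
    text = text.lower().strip()
    return text.translate(str.maketrans("", "", string.punctuation))

target = "7"
print(prediction, normalize(prediction) == normalize(target))  # "7" True
```
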
utils.py for the meta_bbh task (21 additions, 0 deletions)

```python
import random
import re

import datasets


def doc_to_text(doc: dict) -> str:
    # The 3.1 evals dataset already stores the fully formatted prompt; use it as-is.
    return doc["input_final_prompts"][0]


def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
    def _process_doc(doc: dict) -> dict:
        # Keep only the fields the harness needs: the question and the reference answer.
        out_doc = {
            "problem": doc["input_question"],
            "answer": doc["input_correct_responses"][0],
        }
        return out_doc

    dataset = dataset.select_columns(
        ["input_question", "input_correct_responses", "input_final_prompts",
         "is_correct", "input_question_hash", "output_prediction_text"]
    )
    # Rename Meta's original correctness flag so it is preserved for comparison
    # without clashing with the harness's own scoring.
    dataset = dataset.rename_column("is_correct", "previously_is_correct")
    return dataset.map(_process_doc)
```
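
To see what `process_docs` produces, the self-contained sketch below builds a toy dataset with the same column names as the 3.1 evals data (all values are invented) and runs the same select/rename/map steps; it assumes a `datasets` version recent enough to provide `Dataset.select_columns`.

```python
import datasets

# Toy stand-in for the 3.1 evals split; column names match, values are invented.
toy = datasets.Dataset.from_dict({
    "input_question": ["How many letters are in 'llama'?"],
    "input_correct_responses": [["5"]],
    "input_final_prompts": [["Q: How many letters are in 'llama'?\nA: Let's think step by step."]],
    "is_correct": [True],
    "input_question_hash": ["abc123"],
    "output_prediction_text": [["the answer is 5."]],
})

processed = toy.select_columns(
    ["input_question", "input_correct_responses", "input_final_prompts",
     "is_correct", "input_question_hash", "output_prediction_text"]
).rename_column("is_correct", "previously_is_correct").map(
    lambda doc: {"problem": doc["input_question"],
                 "answer": doc["input_correct_responses"][0]}
)

print(processed[0]["problem"])  # the original question
print(processed[0]["answer"])   # "5", the reference answer used by exact_match
```
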
meta_gpqa task template YAML (file name not shown in this view; 29 additions, 0 deletions)

```yaml
dataset_path: meta-llama/Meta-Llama-3.1-8B-Instruct-evals
dataset_name: Meta-Llama-3.1-8B-Instruct-evals__gpqa__details
task: meta_gpqa
output_type: generate_until
process_docs: !function utils.process_docs
test_split: latest
doc_to_text: !function utils.doc_to_text
doc_to_target: gold
filter_list:
  - name: "strict-match"
    filter:
      - function: "regex"
        group_select: -1
        regex_pattern: 'best answer is ([A-Z])'
      - function: "take_first"
generation_kwargs:
  until: []
  do_sample: false
  temperature: 0
  max_gen_toks: 2048
num_fewshot: 0
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
    ignore_case: true
    ignore_punctuation: true
metadata:
  version: 1.0
```
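
One difference from the BBH filter is `group_select: -1`, which keeps the last regex match rather than the first; this matters because a long chain-of-thought answer may state "best answer is ..." more than once before settling. A minimal illustration with an invented completion:

```python
import re

# Invented completion for illustration; a long chain of thought may mention
# several candidate letters before settling on the final one.
completion = (
    "At first glance the best answer is B, but reconsidering the units, "
    "the best answer is C."
)

matches = re.findall(r'best answer is ([A-Z])', completion)
print(matches)      # ['B', 'C']
print(matches[-1])  # 'C', since group_select: -1 keeps the last match
```
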
utils.py for the meta_gpqa task (20 additions, 0 deletions)

```python
import random
import re

import datasets


def doc_to_text(doc: dict) -> str:
    # The 3.1 evals dataset already stores the fully formatted prompt; use it as-is.
    return doc["input_final_prompts"][0]


def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
    def _process_doc(doc: dict) -> dict:
        # "gold" holds the correct answer letter referenced by doc_to_target in the yaml.
        out_doc = {
            "problem": doc["input_question"],
            "gold": doc["input_correct_responses"][0],
        }
        return out_doc

    dataset = dataset.select_columns(
        ["input_question", "input_correct_responses", "input_final_prompts",
         "is_correct", "input_question_hash", "input_choice_list",
         "output_prediction_text"]
    )
    dataset = dataset.rename_column("is_correct", "previously_is_correct")
    return dataset.map(_process_doc)
```
