amazon-science
diff --git a/‎README.md‎
Lines changed: 68 additions & 7 deletions b/‎README.md‎
Lines changed: 68 additions & 7 deletions
diff --git a/‎evaluations.py‎
Lines changed: 100 additions & 0 deletions b/‎evaluations.py‎
Lines changed: 100 additions & 0 deletions
@@ -1,17 +1,78 @@
-## My Project
+# Multimodal Chain-of-Thought Reasoning in Language Models
 
-TODO: Fill this README out!
+Multimodal-CoT incorporates vision features in a decoupled training framework. The framework consists of two training stages: (i) rationale generation and (ii) answer inference. Both stages share the same model architecture but differ in the input and output.
 
-Be sure to:
+![](vision_features/mm-cot.png)
 
-* Change the title in this README
-* Edit your repository description on GitHub
 
-## Security
+## Requirements
 
-See [CONTRIBUTING](CONTRIBUTING.md#security-issue-notifications) for more information.
+Install all required Python dependencies:
+
+```
+pip install -r requirements.txt
+```
+
+## Datasets
+
+Download the datasets from the following:
+
+```
+https://github.com/lupantech/ScienceQA/tree/main/data
+```
+
+Download the extracted vision features from [Anonymous](xxx) and unzip the files under `vision_features`.
+
+## Instructions
+
+### Training 
+
+```
+# rationale generation
+CUDA_VISIBLE_DEVICES=0,1 python main.py \
+    --model allenai/unifiedqa-t5-base \
+    --user_msg rationale --img_type detr \
+    --bs 8 --eval_bs 4 --eval_acc 10 --output_len 512 \
+    --final_eval --prompt_format QCM-LE
+
+# answer inference
+CUDA_VISIBLE_DEVICES=0,1 python main.py \
+    --model allenai/unifiedqa-t5-base \
+    --user_msg answer --img_type detr \
+    --bs 8 --eval_bs 4 --eval_acc 10 --output_len 64 \
+    --final_eval --prompt_format QCMG-A \
+    --eval_le experiments/rationale_allenai-unifiedqa-t5-base_detr_QCM-LE_lr5e-05_bs16_op512_ep20/predictions_ans_eval.json \
+    --test_le experiments/rationale_allenai-unifiedqa-t5-base_detr_QCM-LE_lr5e-05_bs16_op512_ep20/predictions_ans_test.json
+```
+
+### Inference 
+
+Our trained models are available at [Anonymous](xxx). To use our trained models, please put them under the ```models``` folder.
+
+```
+# rationale generation
+CUDA_VISIBLE_DEVICES=0,1 python main.py \
+    --model allenai/unifiedqa-t5-base \
+    --user_msg rationale --img_type detr \
+    --bs 8 --eval_bs 4 --eval_acc 10 --output_len 512 \
+    --final_eval --prompt_format QCM-LE \
+    --evaluate_dir models/rationale
+
+# answer inference
+CUDA_VISIBLE_DEVICES=0,1 python main.py \
+    --model allenai/unifiedqa-t5-base \
+    --user_msg answer --img_type detr \
+    --bs 8 --eval_bs 4 --eval_acc 10 --output_len 64 \
+    --final_eval --prompt_format QCMG-A \
+    --eval_le models/rationale/predictions_ans_eval.json \
+    --test_le models/rationale/predictions_ans_test.json \
+    --evaluate_dir models/answer
+```
 
 ## License
 
 This project is licensed under the Apache-2.0 License.
 
+## Acknowledgement
+
+Parts of our code are adapted from [ScienceQA](https://github.com/lupantech/ScienceQA) and [Transformers](https://github.com/huggingface/transformers).
@@ -0,0 +1,100 @@
+'''
+Adapted from https://github.com/lupantech/ScienceQA
+'''
+
+import re
+from rouge import Rouge
+from nltk.translate.bleu_score import sentence_bleu
+from sentence_transformers import util
+
+########################
+## BLEU
+########################
+def tokenize(text):
+    tokens = re.split(r'\s|\.', text)
+    tokens = [t for t in tokens if len(t) > 0]
+    return tokens
+
+
+def bleu_score(reference, hypothesis, gram):
+    reference_tokens = tokenize(reference)
+    hypothesis_tokens = tokenize(hypothesis)
+
+    if gram == 1:
+        bleu = sentence_bleu([reference_tokens], hypothesis_tokens, (1., ))  # BELU-1
+    elif gram == 2:
+        bleu = sentence_bleu([reference_tokens], hypothesis_tokens, (1. / 2., 1. / 2.))  # BELU-2
+    elif gram == 3:
+        bleu = sentence_bleu([reference_tokens], hypothesis_tokens, (1. / 3., 1. / 3., 1. / 3.))  # BELU-3
+    elif gram == 4:
+        bleu = sentence_bleu([reference_tokens], hypothesis_tokens, (1. / 4., 1. / 4., 1. / 4., 1. / 4.))  # BELU-4
+
+    return bleu
+
+
+def caculate_bleu(results, data, gram):
+    bleus = []
+    for qid, output in results.items():
+        prediction = output
+        target = data[qid]
+        target = target.strip()
+        if target == "":
+            continue
+        bleu = bleu_score(target, prediction, gram)
+        bleus.append(bleu)
+
+    avg_bleu = sum(bleus) / len(bleus)
+
+    return avg_bleu
+
+
+########################
+## Rouge-L
+########################
+def score_rouge(str1, str2):
+    rouge = Rouge(metrics=["rouge-l"])
+    scores = rouge.get_scores(str1, str2, avg=True)
+    rouge_l = scores['rouge-l']['f']
+    return rouge_l
+
+
+def caculate_rouge(results, data):
+    rouges = []
+    for qid, output in results.items():
+        prediction = output
+        target = data[qid]
+        target = target.strip()
+        if prediction == "":
+            continue
+        if target == "":
+            continue
+        rouge = score_rouge(target, prediction)
+        rouges.append(rouge)
+
+    avg_rouge = sum(rouges) / len(rouges)
+    return avg_rouge
+
+
+########################
+## Sentence Similarity
+########################
+def similariry_score(str1, str2, model):
+    # compute embedding for both lists
+    embedding_1 = model.encode(str1, convert_to_tensor=True)
+    embedding_2 = model.encode(str2, convert_to_tensor=True)
+    score = util.pytorch_cos_sim(embedding_1, embedding_2).item()
+    return score
+
+
+def caculate_similariry(results, data, model):
+    scores = []
+    for qid, output in results.items():
+        prediction = output
+        target = data[qid]
+        target = target.strip()
+
+        score = similariry_score(target, prediction, model)
+        scores.append(score)
+
+    avg_score = sum(scores) / len(scores)
+    return avg_score