|
| 1 | +"""CodeXGLUE: A Machine Learning Benchmark Dataset for Code Understanding and Generation |
| 2 | +https://arxiv.org/abs/2102.04664 |
| 3 | +
|
| 4 | +Code-to-Code Translation task from CodeXGLUE: |
| 5 | +Translating code between Java and C# programming languages. |
| 6 | +The dataset is collected from several public repos including Lucene, POI, JGit and Antlr. |
| 7 | +
|
| 8 | +Dataset: https://huggingface.co/datasets/google/code_x_glue_cc_code_to_code_trans |
| 9 | +- 10,300 training samples |
| 10 | +- 500 validation samples |
| 11 | +- 1,000 test samples |
| 12 | +
|
| 13 | +This is a zero-shot or few-shot task evaluated with CodeBLEU score. |
| 14 | +CodeBLEU is a metric specifically designed for code generation that considers: |
| 15 | +- N-gram matching (like BLEU) |
| 16 | +- Weighted n-gram matching based on syntax |
| 17 | +- Syntax match using AST |
| 18 | +- Dataflow match for semantic similarity |
| 19 | +
|
| 20 | +Reference: https://arxiv.org/abs/2009.10297 |
| 21 | +""" |
| 22 | +import json |
| 23 | + |
| 24 | +from codebleu import calc_codebleu |
| 25 | + |
| 26 | +from bigcode_eval.base import Task |
| 27 | + |
# BibTeX entry for the CodeXGLUE benchmark paper (Lu et al., 2021);
# surfaced by the evaluation harness when citing this task.
_CITATION = """
@article{DBLP:journals/corr/abs-2102-04664,
  author    = {Shuai Lu and
               Daya Guo and
               Shuo Ren and
               Junjie Huang and
               Alexey Svyatkovskiy and
               Ambrosio Blanco and
               Colin B. Clement and
               Dawn Drain and
               Daxin Jiang and
               Duyu Tang and
               Ge Li and
               Lidong Zhou and
               Linjun Shou and
               Long Zhou and
               Michele Tufano and
               Ming Gong and
               Ming Zhou and
               Nan Duan and
               Neel Sundaresan and
               Shao Kun Deng and
               Shengyu Fu and
               Shujie Liu},
  title     = {CodeXGLUE: {A} Machine Learning Benchmark Dataset for Code Understanding
               and Generation},
  journal   = {CoRR},
  volume    = {abs/2102.04664},
  year      = {2021}
}
"""
| 59 | + |
# Translation directions supported.
# Schema of each entry:
#   source / target           — dataset field names; get_prompt reads doc[source]
#                               and get_reference reads doc[target]
#   source_name / target_name — human-readable language names interpolated
#                               into the prompt text
#   codebleu_lang             — language identifier expected by the codebleu
#                               package's calc_codebleu (target language)
TRANSLATION_DIRECTIONS = {
    "java_cs": {
        "source": "java",
        "target": "cs",
        "source_name": "Java",
        "target_name": "C#",
        "codebleu_lang": "c_sharp",  # Target language for CodeBLEU evaluation
    },
    "cs_java": {
        "source": "cs",
        "target": "java",
        "source_name": "C#",
        "target_name": "Java",
        "codebleu_lang": "java",  # Target language for CodeBLEU evaluation
    },
}
| 78 | + |
| 79 | + |
def create_all_tasks():
    """Build the mapping from task name to task class for every direction.

    :return: {task_name: task}
    e.g. {codexglue_code_to_code_trans-java_cs: Task, codexglue_code_to_code_trans-cs_java: Task}
    """
    tasks = {}
    for direction in TRANSLATION_DIRECTIONS:
        task_name = f"codexglue_code_to_code_trans-{direction}"
        tasks[task_name] = create_task(direction)
    return tasks
| 89 | + |
| 90 | + |
def create_task(direction):
    """Create a CodeToCodeTrans subclass whose constructor is pre-bound to *direction*."""

    def _init(self, **kwargs):
        # Forward to the base initializer with the captured direction;
        # equivalent to super().__init__ under single inheritance.
        CodeToCodeTrans.__init__(self, direction, **kwargs)

    return type("CodeToCodeTransTask", (CodeToCodeTrans,), {"__init__": _init})
| 97 | + |
| 98 | + |
class CodeToCodeTrans(Task):
    """Code-to-Code Translation task for Java ↔ C# translation.

    A task represents an entire benchmark including its dataset, problems,
    answers, generation settings and evaluation methods.
    """

    # Dataset identifier resolved by the harness / Hugging Face hub.
    DATASET_PATH = "code_x_glue_cc_code_to_code_trans"
    DATASET_NAME = None

    def __init__(self, direction):
        """Initialize the code translation task.

        :param direction: str
            Translation direction, either 'java_cs' or 'cs_java'
        :raises KeyError:
            if direction is not a key of TRANSLATION_DIRECTIONS
        """
        self.direction = direction
        self.direction_config = TRANSLATION_DIRECTIONS[direction]
        # Lazy cache for the few-shot examples file: get_prompt is called once
        # per document, and without caching each call would re-read and
        # re-parse the same JSON file from disk.
        self._fewshot_cache = None
        super().__init__(
            stop_words=["\n\n", "\n//", "\n/*", "\n#"],  # Stop at blank lines or comments
            requires_execution=False,
        )

    def get_dataset(self):
        """Returns dataset for the task or an iterable of any object, that get_prompt can handle."""
        return self.dataset["test"]

    def fewshot_examples(self):
        """Loads and returns the few-shot examples for the task if they exist.

        The JSON file is read at most once per task instance; subsequent calls
        return the cached contents.
        """
        if self._fewshot_cache is None:
            with open(
                "bigcode_eval/tasks/few_shot_examples/codexglue_code_to_code_trans_few_shot_prompts.json",
                "r",
            ) as file:
                self._fewshot_cache = json.load(file)
        return self._fewshot_cache[self.direction]

    @staticmethod
    def two_shot_prompt(entry, source_code, examples, source_name, target_name):
        """Two shot prompt format with source and target code examples.

        :param entry: str
            Instruction prefix for the task
        :param source_code: str
            The source code to translate
        :param examples: dict
            Few-shot examples containing source1, target1, source2, target2
        :param source_name: str
            Name of the source language (e.g., 'Java')
        :param target_name: str
            Name of the target language (e.g., 'C#')
        :return: str
            The complete prompt
        """
        prompt = f"""{entry}
{source_name}:
{examples['source1']}
{target_name}:
{examples['target1']}

{source_name}:
{examples['source2']}
{target_name}:
{examples['target2']}

{source_name}:
{source_code}
{target_name}:
"""
        return prompt

    def get_prompt(self, doc):
        """Builds the prompt for the LM to generate from.

        :param doc: dict[str: str]
            sample from the test dataset
        :return: str
        """
        source_name = self.direction_config["source_name"]
        target_name = self.direction_config["target_name"]
        source_field = self.direction_config["source"]

        source_code = doc[source_field].strip()
        entry = f"Translate the following code from {source_name} to {target_name}:\n"
        examples = self.fewshot_examples()
        prompt = self.two_shot_prompt(entry, source_code, examples, source_name, target_name)
        return prompt

    def get_reference(self, doc):
        """Builds the reference solution for the doc (sample from the test dataset).

        :param doc: dict[str: str]
            sample from the test dataset
        :return: str
        """
        target_field = self.direction_config["target"]
        return doc[target_field].strip()

    def postprocess_generation(self, generation, idx):
        """Defines the postprocessing for a LM generation.

        :param generation: str
            code generation from LM
        :param idx: int
            index of doc in the dataset to which the generation belongs
            (not used for this task)
        :return: str
        """
        target_name = self.direction_config["target_name"]
        # Extract the generated code after the last target language marker
        marker = f"{target_name}:\n"
        if marker in generation:
            output = generation.split(marker)[-1]
        else:
            output = generation

        # Clean up the output - take first complete function/method
        output = output.strip()

        # Safety-net truncation, kept consistent with the stop_words declared
        # in __init__ ("\n#" was previously missing from this list).
        for stop in ["\n\n", "\n//", "\n/*", "\n#"]:
            if stop in output:
                output = output.split(stop)[0]

        return output.strip()

    def process_results(self, generations, references):
        """Takes the list of LM generations and evaluates them against ground truth references,
        returning the CodeBLEU metric for the generations.

        CodeBLEU combines:
        - ngram_match_score: Standard n-gram matching (like BLEU)
        - weighted_ngram_match_score: N-gram matching weighted by syntax
        - syntax_match_score: AST-based syntax matching
        - dataflow_match_score: Semantic dataflow matching
        - codebleu: Combined score (weighted average of above)

        :param generations: list(list(str))
            list of lists containing generations
        :param references: list(str)
            list of str containing references
        :return: dict[str: float]
        """
        # Extract the first generation from each list
        predictions = [gen[0] for gen in generations]

        # Get the target language for CodeBLEU evaluation
        lang = self.direction_config["codebleu_lang"]

        # Compute CodeBLEU score
        # calc_codebleu expects references as list of strings (one per sample)
        # and predictions as list of strings (one per sample)
        results = calc_codebleu(
            references=references,
            predictions=predictions,
            lang=lang,
        )

        return results
| 257 | + |