
Commit cf3b941

Clean S* TODOs (#116)
1 parent 07b64cd commit cf3b941

4 files changed: +13, -13 lines


skythought/test-time-scaling/codecontest_evaluate_multiprocess.py

Lines changed: 3 additions & 3 deletions
@@ -309,7 +309,7 @@ def get_accuracy_all_rounds(dataset, num_process_evaluate, method="selfdebug", t
     return final_accuracy


-def generate_and_evaluate(arguments): ##TODO Alex, take in a method here to support all methods
+def generate_and_evaluate(arguments):
     """
     Takes in a single dspy example, generate code and evaluate it.

@@ -342,7 +342,7 @@ def generate_and_evaluate(arguments): ##TODO Alex, take in a method here to supp
         # assert False
     ## Initialize the code generator
     if method == "selfdebug":
-        ## initialize debug lm to be 40mini : TODO(Alex): delete this if not work, or add a new argument for this if this works
+        ## initialize debug lm to be 40mini : TODO: delete this if not work, or add a new argument for this if this works
         debug_lm = dspy.LM('openai/gpt-4o-mini', cache=use_dspy_cache)
         test_program = CodeGeneratorWithSelfDebug(extracted_tests, num_round=args.num_round, n=args.n, temperature=args.temperature,
                                                   lm=lm, selection=args.selection, context=args.context, judge_lm=judge_lm, pre_computed_tests=tests,

@@ -357,7 +357,7 @@ def generate_and_evaluate(arguments): ##TODO Alex, take in a method here to supp
         test_program = NaiveCodeGeneratorNoDSPy(args)


-    # TODO: @DL support oracle
+    # TODO: support oracle
    # if args.selection == "debug_all":
    #     eval_metric=live_code_bench_evaluate_batch
    # else:
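
The hunk above keeps the pattern of routing the self-debug rounds through a separate, cheaper model. A minimal sketch of that pattern with dspy, for orientation only: the model names, the signature string, and the inputs are illustrative, not the repository's actual configuration.

import dspy

# Hypothetical model choices; the repo reads its own args and cache flag.
main_lm = dspy.LM("openai/gpt-4o", cache=True)        # generates candidate solutions
debug_lm = dspy.LM("openai/gpt-4o-mini", cache=True)  # cheaper model for debug rounds

dspy.configure(lm=main_lm)  # default LM for generation

# A debug round can scope the cheaper model with a context manager:
with dspy.context(lm=debug_lm):
    fix = dspy.Predict("code, error -> fix_suggestion")(
        code="print(1/0)", error="ZeroDivisionError: division by zero"
    )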

skythought/test-time-scaling/evaluate_multiprocess.py

Lines changed: 2 additions & 2 deletions
@@ -313,7 +313,7 @@ def get_accuracy_all_rounds(dataset, num_process_evaluate, method="selfdebug", t
     return final_accuracy


-def generate_and_evaluate(arguments): ##TODO Alex, take in a method here to support all methods
+def generate_and_evaluate(arguments): ##TODO take in a method here to support all methods
     """
     Takes in a single dspy example, generate code and evaluate it.

@@ -346,7 +346,7 @@ def generate_and_evaluate(arguments): ##TODO Alex, take in a method here to supp
         # assert False
     ## Initialize the code generator
     if method == "selfdebug":
-        ## initialize debug lm to be 40mini : TODO(Alex): delete this if not work, or add a new argument for this if this works
+        ## initialize debug lm to be 40mini : TODO delete this if not work, or add a new argument for this if this works
         debug_lm = dspy.LM('openai/gpt-4o-mini', cache=use_dspy_cache)
         test_program = CodeGeneratorWithSelfDebug(extracted_tests, num_round=args.num_round, n=args.n, temperature=args.temperature,
                                                   lm=lm, selection=args.selection, context=args.context, judge_lm=judge_lm, pre_computed_tests=tests,

skythought/test-time-scaling/live_code_bench_execute.py

Lines changed: 2 additions & 2 deletions
@@ -200,7 +200,7 @@ def unsafe_lcb_run_timeout_tests(timeout_input_list, completion, timeout, is_std
     result = manager.list()
     p = multiprocessing.Process(target=run_single_timeout_test_list, args=(timeout_input_list, completion, timeout, is_stdin, result))
     p.start()
-    p.join(timeout = timeout * len(timeout_input_list) + 3) # TODO Alex: what should be the timeout here?
+    p.join(timeout = timeout * len(timeout_input_list) + 3) # TODO what should be the timeout here?
     if p.is_alive():
         p.kill()

@@ -247,7 +247,7 @@ def unsafe_lcb_runTests(problem, completion, timeout, runtime_debug, is_extracte
     p.start()
     # print(f"There are {len(test_cases)} test cases.")
     if fast_check:
-        p.join(fast_check_global_time_out) # TODO Alex: Check whether number of task cases is correct
+        p.join(fast_check_global_time_out) # TODO Check whether number of task cases is correct
     else:
         p.join(timeout = (timeout+1) * len(test_cases) + 5)
     if p.is_alive():

skythought/test-time-scaling/live_code_bench_program.py

Lines changed: 6 additions & 6 deletions
@@ -882,7 +882,7 @@ def forward(
     ):
         prompt = example["prompt"]

-        ## TODO: (Alex) Here make sure this is the right place to read cache
+        ## TODO: Here make sure this is the right place to read cache
         if self.args.load_cached_preds:
             ## load the cached prediction completions and get the public accuracy to replicate zipped_history
             codes = self.cached_preds_dict[task_id]['codes']

@@ -961,7 +961,7 @@ def forward(
             api_key=os.environ.get("OPENAI_API_KEY"), # This is the default and can be omitted
         )
         # print(f"Using {self.args.generator} to generate code")
-        # TODO: DL, please clean up these massy naming
+        # TODO: please clean up these massy naming
         generator = self.args.generator
         if generator == "4o-mini":
             generator = "gpt-4o-mini"

@@ -1090,7 +1090,7 @@ def forward(
                 zipped_history[n].append(("", 0, "", "", 0)) ## if any exception occur (like context window limit exceeded, fallback to simply empty completion)
         # print(zipped_history[-1][-1])
         # print(f"=" * 10 + "Finished generating selfdebug prediction" + "=" * 10)
-        return self.selection_function(zipped_history, task_id, prompt, is_stdin, example), None ## (Alex) example is newly added, could get rid of the redundancy for prompt and is_stdin
+        return self.selection_function(zipped_history, task_id, prompt, is_stdin, example), None ## example is newly added, could get rid of the redundancy for prompt and is_stdin

     def get_anchor_break_and_feedback(self, prompt, pred, extracted_tests, public_test_acc, public_test_feedback_string, generated_test_anchors):
         anchor_break = False

@@ -1312,7 +1312,7 @@ def selection_function(self, zipped_history, task_id, prompt, is_stdin, example)
         ]
         # print(private_tests)
         if self.selection == "generated_tests":
-            with dspy.context(lm=self.debug_lm): ## TODO (Alex): here I simply used debug_lm because debug_lm is fixed to be 4omini, we can make this an argument
+            with dspy.context(lm=self.debug_lm): ## TODO here I simply used debug_lm because debug_lm is fixed to be 4omini, we can make this an argument
                 timeout_input_list = generate_tests_for_one_example(example, generation_fun=generate_timeout_tests_repeat,num_timeout_tests=3)
         best_rate = -1
         public_correct_samples_pass_rate = []

@@ -1338,7 +1338,7 @@ def selection_function(self, zipped_history, task_id, prompt, is_stdin, example)
             preds_pass = [
                 list(
                     map(
-                        lambda test: 1# test["count"] # @DL: This is weird, should just reduce the same tests.
+                        lambda test: 1# test["count"] # This is weird, should just reduce the same tests.
                         if check_test(
                             [test["test"]], post_process_code(public_correct_sample), 0, prompt, "dummy", runtime_debug=True, raw=True, is_extracted=False
                         )[0]

@@ -2042,7 +2042,7 @@ def generate_tests_for_whole_dataset(
         zero_correct_tests_count,
     )

-def generate_tests_for_one_example(example, ##TODO :Alex, make sure what ever takes the output of this function to be able to handle the new output format
+def generate_tests_for_one_example(example, ##TODO Make sure what ever takes the output of this function to be able to handle the new output format
                                    generation_fun,
                                    completions = None,
                                    judge_lm=None,
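
The selection_function hunks above score candidate completions by their pass rate on model-generated tests and keep the best one. A stripped-down sketch of that selection idea with hypothetical helpers (run_test stands in for the repo's check_test; this is not the repository's API):

def select_by_generated_tests(candidates, generated_tests, run_test):
    """candidates: list of code strings; generated_tests: list of test cases;
    run_test(code, test) -> bool is a stand-in for the repo's check_test."""
    best_code, best_rate = None, -1.0
    for code in candidates:
        passed = sum(1 for test in generated_tests if run_test(code, test))
        rate = passed / max(len(generated_tests), 1)
        if rate > best_rate:
            best_code, best_rate = code, rate
    return best_code, best_rate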
