diff --git a/PROJECT_ISSUES.md b/PROJECT_ISSUES.md new file mode 100644 index 0000000..657a8eb --- /dev/null +++ b/PROJECT_ISSUES.md @@ -0,0 +1,178 @@ +# CodeGeeX Project Issues List + +This document lists all identified issues in the CodeGeeX project. + +## 1. Hardcoded Paths (High Priority) ✅ FIXED + +### MindSpore Scripts +**Status: All hardcoded paths have been replaced with configurable options** + +All hardcoded paths have been fixed: + +- **`codegeex/mindspore/generation_values.py`** ✅ + - Now uses `--output_path` argument (defaults to `./output`) + - Output file: `output_values.npy` + +- **`codegeex/mindspore/generation_humaneval.py`** ✅ + - Now uses `--input_path` for dataset (with smart fallback to repository-relative paths) + - Now uses `--output_path` for save directory (defaults to `./output`) + - Language parameter is now properly used (no longer hardcoded to C++) + +- **`codegeex/mindspore/generation_finetune.py`** ✅ + - Now uses `--output_path` argument (defaults to `./output`) + +- **`codegeex/mindspore/generation_batch.py`** ✅ + - Now uses `--output_path` argument (defaults to `./output`) + +- **`codegeex/mindspore/src/dataset.py`** ✅ + - Now uses `eval_data_url` argument (if provided) + +- **`codegeex/mindspore/train.py`** ✅ + - Cache paths now use `CODEGEEX_CACHE_BASE` environment variable (defaults to `/home/work/sfs/cache`) + - `BATCH_JOB_ID` now uses `.get()` with fallback + +- **`codegeex/mindspore/scripts/run_modelarts*.py`** ✅ + - Temp directory now uses `MODELARTS_TEMP_DIR` environment variable (defaults to `/home/work/sfs/xx`) + - Added file existence checks before copying + +**New command-line arguments added:** +- `--output_path`: Output directory for generated files (default: `./output`) +- `--input_path`: Input path for data files (optional) + +**New environment variables:** +- `CODEGEEX_CACHE_BASE`: Base directory for cache files (default: `/home/work/sfs/cache`) +- `MODELARTS_TEMP_DIR`: Temp directory for ModelArts scripts 
(default: `/home/work/sfs/xx`) + + ## 2. Configuration Placeholders (Medium Priority) + +Multiple configuration files contain placeholder values that must be set: + +- **`configs/codegeex_13b.sh`**: `CHECKPOINT_PATH` placeholder +- **`configs/codegeex_13b_parallel.sh`**: `CHECKPOINT_PATH` placeholder +- **`configs/codegeex_13b_paddle.sh`**: `CHECKPOINT_PATH` placeholder +- **`scripts/pretrain_codegeex.sh`**: + - `HOSTFILE` placeholder + - `DATA_PATH` placeholder + - `CKPT_PATH` placeholder + - `OUTPUT_DIR` placeholder +- **`scripts/finetune_codegeex.sh`**: Same placeholders as the pretrain script +- **`codegeex/mindspore/configs/*.sh`**: Multiple config files with `CODE_DATA_DIR` and other placeholder values + +## 3. TODO Comments / Incomplete Code (Medium Priority) + +Multiple TODO comments indicate incomplete work: + +- **`codegeex/mindspore/train.py`** (lines 214, 216): + - TODO: remove after warming-up! + - TODO: add them back if not for the 1st run! + +- **`codegeex/mindspore/src/sat_dataset.py`** (line 81): + - TODO ARGS comment + +- **`codegeex/mindspore/src/dataset.py`** (line 122): + - TODO: set as current validation set path + +- **`codegeex/mindspore/generation_values_1p.py`** (line 166): + - TODO: add them back if not for the 1st run! + +- **`codegeex/mindspore/finetune.py`** (lines 216, 218): + - TODO: remove after warming-up! + - TODO: add them back if not for the 1st run! + +- **`codegeex/mindspore/generation_1p.py`** (line 166): + - TODO: add them back if not for the 1st run! + +- **`codegeex/mindspore/convertion_1p.py`** (lines 154, 160, 180): + - Multiple TODOs for checkpoint names and paths + +- All generation scripts have TODO comments for setting paths + +## 4. 
Security Issues (High Priority) + +- **`codegeex/benchmark/execution.py`** (line 347): + - The Java execution code is commented out behind a security warning + - Warning states: "This program exists to execute untrusted model-generated code" + - Code execution should be sandboxed + - Currently `exec_result` stays `None`, but the code still accesses `.returncode`, which raises `AttributeError` + +- **`codegeex/benchmark/execution.py`** (lines 477-546): + - `reliability_guard()` function has explicit warning: "This function is NOT a security sandbox" + - Users should not blindly execute untrusted code + +## 5. Known Bugs (Documented) + +- **VS Code Extension** (mentioned in README): + - Bug: If cursor is moved before generation finishes, it may cause issues + - Location: `vscode-extension/README.md` and `README_zh.md` + - Status: Acknowledged, team working on making generation faster + +## 6. Debug/Test Code Left in Production (Low Priority) + +- **`scripts/evaluate_humaneval_x.py`** (lines 47-50): + - Hardcoded debug values left in code: + ```python + #Debugging + INPUT_FILE='/home/rog0d/Escritorio/CodeGeeX/generations/humaneval_rust_generations.jsonl.gz' + LANGUAGE='rust' + ``` + - These override command-line arguments + +## 7. Incomplete Implementation (Medium Priority) + +- **`tests/test_inference_paddle.py`** (line 149): + - `raise NotImplementedError("quantize")` - quantization not implemented for Paddle backend + +## 8. Path Construction Bug (Low Priority) + +- **`scripts/evaluate_humaneval_x.py`** (line 44): + - Incorrect path join: `os.path.join(MAIN_DIR, "/codegeex/benchmark/humaneval-x/")` + - The leading slash makes the second argument an absolute path, so `os.path.join` discards `MAIN_DIR`; remove the leading slash to fix it + +## 9. Deprecated/Outdated Information + +- **`README.md`** (line 12): + - Notes that the newer CodeGeeX4 has been released + - The current codebase may be considered a legacy version + +## 10. 
Missing Error Handling + +- **`codegeex/benchmark/execution.py`** (line 348): + - The code accesses `exec_result.returncode` while `exec_result` is `None` (line 336) + - This raises `AttributeError`, so the Java execution path is broken (the same defect noted in Section 4) + +## 11. Hardcoded CUDA Path (Low Priority) + +- **`scripts/generate_humaneval_x.sh`** (line 13): + - `export CUDA_HOME=/usr/local/cuda-11.1/` - hardcoded CUDA version +- **`scripts/translate_humaneval_x.sh`** (line 14): + - Same hardcoded CUDA path + +## 12. Configuration Dependencies + +- Scripts depend on specific environment variables: + - `BATCH_JOB_ID` (used in `train.py` and scripts; `train.py` now falls back to `default_job` when it is unset) + - Various NCCL environment variables + - Platform-specific paths for Ascend/MindSpore + +## Summary by Priority + +### Critical (Must Fix Before Production Use) +1. ~~Hardcoded paths in generation scripts~~ ✅ **FIXED** +2. Security issue: Java execution code broken (`None.returncode` raises `AttributeError`) +3. Configuration placeholders not set + +### High Priority +4. Security warnings for code execution +5. Debug code left in evaluate script +6. Known VS Code extension cursor bug + +### Medium Priority +7. Multiple TODO comments indicating incomplete work +8. Hardcoded CUDA paths +9. Path construction bug + +### Low Priority +10. Missing quantization implementation for Paddle +11. Deprecated version notice (CodeGeeX4 available) +12. 
Platform-specific hardcoded paths + diff --git a/codegeex/mindspore/generation_batch.py b/codegeex/mindspore/generation_batch.py index 06798fc..dccd5a6 100644 --- a/codegeex/mindspore/generation_batch.py +++ b/codegeex/mindspore/generation_batch.py @@ -226,15 +226,19 @@ def run_predict(model_predict, config, args_opt, rank): generations = [] batch_size = config.batch_size verbose = (rank % 8 == 0) - save_path = f'/home/work/sfs/xx/pangu_alpha_code/generation_batch/{args_opt.temperature}.txt' # TODO: set as current save path - save_dir = os.path.split(save_path)[0] + + # Use configurable output path + output_dir = getattr(args_opt, 'output_path', './output') + save_dir = os.path.join(output_dir, 'generation_batch') + save_path = os.path.join(save_dir, f'temp_{args_opt.temperature}.txt') + if rank == 0: - if not os.path.exists(save_dir): - os.makedirs(save_dir) + os.makedirs(save_dir, exist_ok=True) if not os.path.exists(save_path): - f = open(save_path, 'w') - f.close() - os.system(f'sudo chmod 777 -R {save_dir}') + with open(save_path, 'w') as f: + pass # Create empty file + if os.name != 'nt': # Only on Unix-like systems + os.system(f'chmod 777 -R {save_dir}') batch = [] input_length = [] sample_ids = [] diff --git a/codegeex/mindspore/generation_finetune.py b/codegeex/mindspore/generation_finetune.py index 9f2a96e..360e515 100644 --- a/codegeex/mindspore/generation_finetune.py +++ b/codegeex/mindspore/generation_finetune.py @@ -210,11 +210,14 @@ def run_predict(model_predict, config, args_opt, rank): generations = [] batch_size = config.batch_size verbose = (rank % 8 == 0) - save_path = f'/home/work/sfs/xx/pangu_alpha_code/generation_finetune/code_translation/{lang}/temp_{args_opt.temperature}.txt' # TODO: set as current save path - save_dir = os.path.split(save_path)[0] + + # Use configurable output path + output_dir = getattr(args_opt, 'output_path', './output') + save_dir = os.path.join(output_dir, 'generation_finetune', 'code_translation', lang) + save_path = 
os.path.join(save_dir, f'temp_{args_opt.temperature}.txt') + if rank == 0: - if not os.path.exists(save_dir): - os.makedirs(save_dir) + os.makedirs(save_dir, exist_ok=True) if not os.path.exists(save_path): f = open(save_path, 'w') f.close() diff --git a/codegeex/mindspore/generation_humaneval.py b/codegeex/mindspore/generation_humaneval.py index 030ff47..ecba456 100644 --- a/codegeex/mindspore/generation_humaneval.py +++ b/codegeex/mindspore/generation_humaneval.py @@ -15,6 +15,7 @@ """ PanGu predict run """ +import gzip import json import os import time @@ -198,25 +199,74 @@ def run_predict(model_predict, config, args_opt, rank): # Define tokenizer tokenizer = CodeTokenizer(mode='6b') - # Tokenize input sentence to ids - humaneval_path = '/home/work/sfs/xx/human_eval_x/data/humaneval_cpp.jsonl' # TODO: set as current humaneval path - humaneval = open(humaneval_path, 'r').readlines() - humaneval = [json.loads(task) for task in humaneval if len(task) != 0] + # Determine language (default to cpp for backward compatibility) + lang = getattr(args_opt, 'language', 'cpp') or 'cpp' + lang_lower = lang.lower() + + # Language tag mapping + lang_tags = { + 'cpp': '// language: C++\n', + 'c++': '// language: C++\n', + 'python': '# language: Python\n', + 'java': '// language: Java\n', + 'javascript': '// language: JavaScript\n', + 'js': '// language: JavaScript\n', + 'go': '// language: Go\n', + } + tag = lang_tags.get(lang_lower, f'// language: {lang}\n') + + # Determine input path + if hasattr(args_opt, 'input_path') and args_opt.input_path: + humaneval_path = args_opt.input_path + else: + # Try relative path from current script location + script_dir = os.path.dirname(os.path.abspath(__file__)) + repo_root = os.path.dirname(os.path.dirname(os.path.dirname(script_dir))) + default_path = os.path.join(repo_root, 'codegeex', 'benchmark', 'humaneval-x', + lang_lower, 'data', f'humaneval_{lang_lower}.jsonl.gz') + # Check if .gz file exists, otherwise try .jsonl + if 
os.path.exists(default_path): + humaneval_path = default_path + else: + default_path_jsonl = default_path.replace('.jsonl.gz', '.jsonl') + if os.path.exists(default_path_jsonl): + humaneval_path = default_path_jsonl + else: + # Fallback: use input_path or raise error + humaneval_path = default_path_jsonl + if rank == 0: + print(f"Warning: Default path {humaneval_path} does not exist. Please set --input_path") + + # Open file (handle .gz files) + if humaneval_path.endswith('.gz'): + with gzip.open(humaneval_path, 'rt') as f: + humaneval = [json.loads(line) for line in f if line.strip()] + else: + with open(humaneval_path, 'r') as f: + humaneval = [json.loads(line) for line in f if line.strip()] + samples = [task['prompt'] for task in humaneval] generations = [] batch_size = config.batch_size verbose = (rank % 8 == 0) - part = int(args_opt.part) + part = int(args_opt.part) if args_opt.part else 0 gen_times = 12 # TODO: set as generation times of current task print(f"gen times: {gen_times}, part: {part}") - save_path = f'/home/work/sfs/xx/pangu_alpha_code/generation_humanevalx/cpp/temp_{args_opt.temperature}/samples_{args_opt.load_ckpt_epoch}_part_{part}.jsonl' # TODO: set as current save path + + # Determine output path + output_dir = getattr(args_opt, 'output_path', './output') + os.makedirs(output_dir, exist_ok=True) + save_path = os.path.join(output_dir, + f'humaneval_{lang_lower}_temp_{args_opt.temperature}_samples_{args_opt.load_ckpt_epoch}_part_{part}.jsonl') + if rank == 0 and not os.path.exists(save_path): os.makedirs(os.path.split(save_path)[0], exist_ok=True) - f = open(save_path, 'w') - f.close() - os.system(f'sudo chmod 777 {save_path}') + with open(save_path, 'w') as f: + pass # Create empty file + if os.name != 'nt': # Only on Unix-like systems + os.system(f'chmod 777 {save_path}') + for i, sample in enumerate(samples): - tag = "// language: C++\n" sample = tag + sample if rank % 8 == 0: print(f"=================== prompt {i} ====================") diff 
--git a/codegeex/mindspore/generation_values.py b/codegeex/mindspore/generation_values.py index 7c81035..2209e07 100644 --- a/codegeex/mindspore/generation_values.py +++ b/codegeex/mindspore/generation_values.py @@ -199,9 +199,15 @@ def run_predict(model_predict, config, args_opt, rank): init, batch_valid_length) output = output_logits.asnumpy() if rank == 0: - np.save("/home/work/sfs/xx/pangu_alpha_code/output_6_7375_8.13.npy", output) # TODO: set as current save path - os.system( - "chmod 777 /home/work/sfs/xx/pangu_alpha_code/output_6_7375_8.13.npy") # TODO: set as current save path + # Use configurable output path + output_dir = getattr(args_opt, 'output_path', './output') + os.makedirs(output_dir, exist_ok=True) + output_file = os.path.join(output_dir, "output_values.npy") + np.save(output_file, output) + # Only try to chmod if on Unix-like system + if os.name != 'nt': + os.system(f"chmod 777 {output_file}") + print(f"== Output saved to: {output_file}") print("== Output shape: ", output.shape) diff --git a/codegeex/mindspore/scripts/run_modelarts.py b/codegeex/mindspore/scripts/run_modelarts.py index 05715a0..a7e25c5 100644 --- a/codegeex/mindspore/scripts/run_modelarts.py +++ b/codegeex/mindspore/scripts/run_modelarts.py @@ -21,11 +21,16 @@ os.environ["LOG_PATH"] = tb_path -print("=================RANK_TABLE_FILE: ", os.environ["RANK_TABLE_FILE"], flush=True) +print("=================RANK_TABLE_FILE: ", os.environ.get("RANK_TABLE_FILE", "not set"), flush=True) print("=================ms import done", flush=True) time.sleep(10) -os.system( - "cp /home/work/rank_table/jobstart_hccl.json /home/work/sfs/xx; sudo chmod +777 /home/work/rank_table/jobstart_hccl.json") +# Use configurable temp directory (platform-specific for ModelArts) +temp_dir = os.environ.get("MODELARTS_TEMP_DIR", "/home/work/sfs/xx") +rank_table_source = "/home/work/rank_table/jobstart_hccl.json" +if os.path.exists(rank_table_source): + os.system(f"cp {rank_table_source} {temp_dir}; sudo chmod 
+777 {rank_table_source}") +else: + print(f"Warning: {rank_table_source} does not exist. Skipping copy.") ret = os.system(f"cd {log_path} && bash {args.script} 2>&1 | tee output.log") if os.environ.get("RANK_ID") == 0: log_dir = os.path.join(args.work_dir, "logs", os.environ.get("JOB_ID")) diff --git a/codegeex/mindspore/scripts/run_modelarts_gen_finetune.py b/codegeex/mindspore/scripts/run_modelarts_gen_finetune.py index 2e5f1ec..0945dd5 100644 --- a/codegeex/mindspore/scripts/run_modelarts_gen_finetune.py +++ b/codegeex/mindspore/scripts/run_modelarts_gen_finetune.py @@ -26,11 +26,16 @@ else: os.environ["LANGUAGE"] = "Null" -print("=================RANK_TABLE_FILE: ", os.environ["RANK_TABLE_FILE"], flush=True) +print("=================RANK_TABLE_FILE: ", os.environ.get("RANK_TABLE_FILE", "not set"), flush=True) print("=================ms import done", flush=True) time.sleep(10) -os.system( - "cp /home/work/rank_table/jobstart_hccl.json /home/work/sfs/xx; sudo chmod +777 /home/work/rank_table/jobstart_hccl.json") +# Use configurable temp directory (platform-specific for ModelArts) +temp_dir = os.environ.get("MODELARTS_TEMP_DIR", "/home/work/sfs/xx") +rank_table_source = "/home/work/rank_table/jobstart_hccl.json" +if os.path.exists(rank_table_source): + os.system(f"cp {rank_table_source} {temp_dir}; sudo chmod +777 {rank_table_source}") +else: + print(f"Warning: {rank_table_source} does not exist. 
Skipping copy.") ret = os.system(f"cd {log_path} && bash {args.script} 2>&1 | tee output.log") if os.environ.get("RANK_ID") == 0: log_dir = os.path.join(args.work_dir, "logs", os.environ.get("JOB_ID")) diff --git a/codegeex/mindspore/scripts/run_modelarts_gen_humaneval_x.py b/codegeex/mindspore/scripts/run_modelarts_gen_humaneval_x.py index 256d7d1..6bae2d8 100644 --- a/codegeex/mindspore/scripts/run_modelarts_gen_humaneval_x.py +++ b/codegeex/mindspore/scripts/run_modelarts_gen_humaneval_x.py @@ -26,11 +26,16 @@ else: os.environ["PART"] = "-1" -print("=================RANK_TABLE_FILE: ", os.environ["RANK_TABLE_FILE"], flush=True) +print("=================RANK_TABLE_FILE: ", os.environ.get("RANK_TABLE_FILE", "not set"), flush=True) print("=================ms import done", flush=True) time.sleep(10) -os.system( - "cp /home/work/rank_table/jobstart_hccl.json /home/work/sfs/xx; sudo chmod +777 /home/work/rank_table/jobstart_hccl.json") +# Use configurable temp directory (platform-specific for ModelArts) +temp_dir = os.environ.get("MODELARTS_TEMP_DIR", "/home/work/sfs/xx") +rank_table_source = "/home/work/rank_table/jobstart_hccl.json" +if os.path.exists(rank_table_source): + os.system(f"cp {rank_table_source} {temp_dir}; sudo chmod +777 {rank_table_source}") +else: + print(f"Warning: {rank_table_source} does not exist. 
Skipping copy.") ret = os.system(f"cd {log_path} && bash {args.script} 2>&1 | tee output.log") if os.environ.get("RANK_ID") == 0: log_dir = os.path.join(args.work_dir, "logs", os.environ.get("JOB_ID")) diff --git a/codegeex/mindspore/src/dataset.py b/codegeex/mindspore/src/dataset.py index 725ecea..45f5737 100644 --- a/codegeex/mindspore/src/dataset.py +++ b/codegeex/mindspore/src/dataset.py @@ -118,8 +118,14 @@ def create_dataset(batch_size, data_path, args_opt, device_num=1, rank=0, drop=T num_parallel_workers = 4 train_data = get_code_data_train(data_path, args_opt, skip_num=(skip_num // num_parallel_workers)) if train_and_eval: - val_data = get_code_data_eval("/home/work/sfs/xx/data_valid", - args_opt) # TODO: set as current validation set path + # Use eval_data_url if provided, otherwise skip validation + val_data_path = getattr(args_opt, 'eval_data_url', None) + if val_data_path: + val_data = get_code_data_eval(val_data_path, args_opt) + else: + if rank == 0: + print("Warning: train_and_eval is True but eval_data_url is not set. 
Skipping validation data.") + val_data = None else: val_data = None diff --git a/codegeex/mindspore/src/utils.py b/codegeex/mindspore/src/utils.py index 767fc5d..6bb5ac4 100644 --- a/codegeex/mindspore/src/utils.py +++ b/codegeex/mindspore/src/utils.py @@ -310,6 +310,14 @@ def add_inference_params(opt): default="true", choices=["true", "false"], help="Whether enable state reuse") + opt.add_argument("--output_path", + type=str, + default="./output", + help="Output path for generated files") + opt.add_argument("--input_path", + type=str, + default=None, + help="Input path for data files (e.g., HumanEval dataset)") def add_training_params(opt): diff --git a/codegeex/mindspore/train.py b/codegeex/mindspore/train.py index 9ccd461..e5853e6 100644 --- a/codegeex/mindspore/train.py +++ b/codegeex/mindspore/train.py @@ -219,9 +219,13 @@ def run_train(args_opt): # args_opt.has_trained_steps = int(param_dict["step_num"].data.asnumpy()) # args_opt.has_trained_steps = 9000 - os.mkdir(f'/home/work/sfs/cache/{os.environ["BATCH_JOB_ID"]}/1/rank_{rank}') + # Use configurable cache directory (platform-specific for checkpoint synchronization) + cache_base = os.environ.get('CODEGEEX_CACHE_BASE', '/home/work/sfs/cache') + batch_job_id = os.environ.get('BATCH_JOB_ID', 'default_job') + cache_dir_1 = os.path.join(cache_base, batch_job_id, '1') + os.makedirs(os.path.join(cache_dir_1, f'rank_{rank}'), exist_ok=True) while True: - num = len(os.listdir(f'/home/work/sfs/cache/{os.environ["BATCH_JOB_ID"]}/1')) + num = len([d for d in os.listdir(cache_dir_1) if os.path.isdir(os.path.join(cache_dir_1, d)) and d.startswith('rank_')]) if num == device_num: break if rank % 64 == 0: @@ -280,9 +284,13 @@ def run_train(args_opt): net_not_load = load_param_into_net(pangu_alpha_with_loss, param_dict) opt_not_load = load_param_into_net(optimizer, param_dict) - os.mkdir(f'/home/work/sfs/cache/{os.environ["BATCH_JOB_ID"]}/2/rank_{rank}') + # Use configurable cache directory (platform-specific for checkpoint 
synchronization) + cache_base = os.environ.get('CODEGEEX_CACHE_BASE', '/home/work/sfs/cache') + batch_job_id = os.environ.get('BATCH_JOB_ID', 'default_job') + cache_dir_2 = os.path.join(cache_base, batch_job_id, '2') + os.makedirs(os.path.join(cache_dir_2, f'rank_{rank}'), exist_ok=True) while True: - num = len(os.listdir(f'/home/work/sfs/cache/{os.environ["BATCH_JOB_ID"]}/2')) + num = len([d for d in os.listdir(cache_dir_2) if os.path.isdir(os.path.join(cache_dir_2, d)) and d.startswith('rank_')]) if num == device_num: break if rank % 64 == 0: @@ -317,7 +325,7 @@ def run_train(args_opt): sink_size=args_opt.sink_size, dataset_sink_mode=True) finally: if args_opt.profiling: - jobid = os.environ["BATCH_JOB_ID"] + jobid = os.environ.get("BATCH_JOB_ID", "default_job") profiler.analyse() rank_id = rank if context.get_context("save_graphs"):