diff --git a/.gitignore b/.gitignore
index 7e90aa9..f7dfc06 100644
--- a/.gitignore
+++ b/.gitignore
@@ -161,3 +161,4 @@ cython_debug/
 # and can be added to the global gitignore or merged into this file. For a more nuclear
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
 .idea/
+.vscode/
diff --git a/codepile/ghdiffs/ghdiffs_repo_filter.py b/codepile/ghdiffs/ghdiffs_repo_filter.py
new file mode 100644
index 0000000..0082cfe
--- /dev/null
+++ b/codepile/ghdiffs/ghdiffs_repo_filter.py
@@ -0,0 +1,98 @@
+import json
+import logging
+import os
+from io import StringIO
+
+import pandas as pd
+import requests
+from tqdm import tqdm
+
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s %(levelname)-8s %(message)s',
+    datefmt='%Y-%m-%d %H:%M:%S')
+logger = logging.getLogger(__name__)
+
+
+def read_json_lines_file(json_path: str) -> list:
+    """Read a JSON-lines file into a list of dicts."""
+    with open(json_path, "r") as f:
+        return [json.loads(line) for line in f]
+
+
+def read_json_file(json_path: str) -> dict:
+    """Read a single JSON object, e.g. the checkpoint file."""
+    with open(json_path, "r") as f:
+        return json.load(f)
+
+
+def write_json_file(json_path: str, obj: dict) -> None:
+    with open(json_path, "w") as json_file:
+        json.dump(obj, json_file)
+
+
+class GitHubDiffFilter:
+    def __init__(self) -> None:
+        repo_file_url = "https://raw.githubusercontent.com/EleutherAI/github-downloader/master/github_repositories.csv"
+        repo_file_string = StringIO(requests.get(repo_file_url).text)
+        self.single_file_count_threshold = 10_000
+        self.save_every = 1000
+        if not os.path.isdir("tmp"):
+            os.mkdir("tmp")
+            logger.info("Created `tmp/` checkpoint directory")
+        self.ckpt_path = os.path.join("tmp", "ckpt.json")
+
+        # Resume from the last checkpointed file index, if one exists.
+        if os.path.exists(self.ckpt_path):
+            self.to_start = read_json_file(self.ckpt_path)["index"]
+        else:
+            self.to_start = 0
+        self.repos_df = pd.read_csv(repo_file_string)
+        # A set gives O(1) membership checks in the filter loop below.
+        self.top_repos = set(self.get_top_repo_list())
+        self.checkpoint_list = []  # Holds the input-file index at each chunk flush.
+        logger.info(f"Number of top repos: {len(self.top_repos)}")
+
+    def get_top_repo_list(self) -> list:
+        # The first CSV column holds the repository names ("owner/name").
+        return self.repos_df.iloc[:, 0].values.tolist()
+
+    def get_diff_path_list(self, github_diff_path: str) -> list:
+        """
+        List all diff files in the given directory.
+
+        Returns:
+            list: File names found under `github_diff_path`.
+        """
+        return os.listdir(github_diff_path)
+
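+    # Note on the two on-disk formats handled by the helpers above: the
+    # checkpoint is a single JSON object, while scraped diff files are JSON
+    # lines. A quick sketch of the round-trip (paths are hypothetical):
+    #     write_json_file("tmp/ckpt.json", {"index": 42})
+    #     read_json_file("tmp/ckpt.json")["index"]    # -> 42
+    #     read_json_lines_file("tmp/0.json")          # -> [dict, dict, ...]
+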
+    def __call__(self, github_diff_path, output_file_path):
+        github_diff_files = self.get_diff_path_list(github_diff_path)
+        logger.info(f"Starting to process {len(github_diff_files)} files...")
+        # Running counters and output buffers.
+        content_output_count = 0
+        file_index = 0
+        output_list = []
+        output_file_index = 0
+
+        # Skip files already handled before the last checkpoint.
+        github_diff_files = github_diff_files[self.to_start:]
+        for github_diff_file in tqdm(github_diff_files, total=len(github_diff_files)):
+            file_index += 1
+
+            if file_index % self.save_every == 0:
+                # Checkpoint the absolute index so a restart resumes correctly.
+                write_json_file(self.ckpt_path, {"index": self.to_start + file_index})
+            github_diff_file_path = os.path.join(github_diff_path, github_diff_file)
+            github_diff_content_list = read_json_lines_file(github_diff_file_path)
+            for ind_content in tqdm(github_diff_content_list, total=len(github_diff_content_list), leave=False):
+                if ind_content["repo_name"] in self.top_repos:
+                    output_list.append(ind_content)
+                    content_output_count += 1
+
+            if content_output_count >= self.single_file_count_threshold:
+                # Flush the buffer once it reaches the per-file threshold,
+                # then reset it so chunks do not keep accumulating.
+                self.checkpoint_list.append(file_index)
+                content_df = pd.DataFrame.from_dict(output_list, orient="columns")
+                output_inter_file_path = os.path.join(output_file_path, str(output_file_index) + ".json")
+                content_df.to_json(output_inter_file_path, orient="records", lines=True)
+                output_file_index += 1
+                output_list = []
+                content_output_count = 0
+
+        # Write whatever remains in the buffer as the final chunk.
+        write_json_file(self.ckpt_path, {"index": self.to_start + file_index})
+        if output_list:
+            content_df = pd.DataFrame.from_dict(output_list, orient="columns")
+            output_final_file_path = os.path.join(output_file_path, str(output_file_index) + ".json")
+            content_df.to_json(output_final_file_path, orient="records", lines=True)
+
+
+if __name__ == "__main__":
+    gh_diff_filter = GitHubDiffFilter()
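A minimal driver for the filter above (the module's `__main__` only constructs the object, so the call is shown here); directory names are placeholders:

    from codepile.ghdiffs.ghdiffs_repo_filter import GitHubDiffFilter

    # Keep only records whose repo_name appears in the EleutherAI
    # top-repositories CSV; output is written as chunked JSON-lines files.
    gh_diff_filter = GitHubDiffFilter()
    gh_diff_filter("data/raw_diffs/", "data/filtered_diffs/")  # placeholder paths
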
diff --git a/codepile/ghdiffs/ghdiffs_scrape.py b/codepile/ghdiffs/ghdiffs_scrape.py
new file mode 100644
index 0000000..753446f
--- /dev/null
+++ b/codepile/ghdiffs/ghdiffs_scrape.py
@@ -0,0 +1,183 @@
+import argparse
+import json
+import urllib.request
+from pathlib import Path
+from typing import Any
+
+import dask.bag as db
+from dask.distributed import Client, LocalCluster, progress
+from unidiff import PatchSet
+
+from codepile.dataset import Dataset, DatasetInfo, RawDataset, Scraper
+
+
+def process_ind_patch(patch_diff) -> dict:
+    """Parse a single file's patch into a dict of diff metadata."""
+    patch_parsed_diff: dict = {
+        "hunks": [],
+        "hunks_process": [],
+    }
+
+    patch_parsed_diff["addition_count"] = patch_diff.added
+    patch_parsed_diff["deletion_count"] = patch_diff.removed
+    patch_parsed_diff["src_file"] = patch_diff.source_file
+    patch_parsed_diff["tgt_file"] = patch_diff.target_file
+    # For deleted files the target is /dev/null, so take the file
+    # extension from the source path instead.
+    if patch_parsed_diff["tgt_file"] == "/dev/null":
+        patch_parsed_diff["file_extension"] = Path(patch_diff.source_file).suffix
+    else:
+        patch_parsed_diff["file_extension"] = Path(patch_diff.target_file).suffix
+    for patch_diff_ind in patch_diff:
+        # Split each hunk into its "@@ -l,c +l,c @@" header and body lines.
+        patch_diff_ind = str(patch_diff_ind)
+        patch_diff_split = patch_diff_ind.split("@@")
+        patch_diff_line = patch_diff_split[2].split("\n")
+        patch_diff_line_numbers = [list(map(int, hunk.strip("-+").split(",")))
+                                   for hunk in patch_diff_split[1].strip().split(" ")]
+        patch_parsed_diff["hunks_process"].append(patch_diff_line_numbers + patch_diff_line[:-1])
+        patch_parsed_diff["hunks"].append(patch_diff_ind)
+    patch_parsed_diff["hunks"] = "".join(patch_parsed_diff["hunks"])
+    return patch_parsed_diff
+
+
+def get_before_file(file_diff: dict, commit_hash: str, repo_name: str, length_threshold: int) -> str:
+    """Reconstruct the pre-commit file contents by reverse-applying the patch."""
+    repo_owner, repo_name = repo_name.split("/")
+    if file_diff["src_file"] == "/dev/null":
+        # Newly added file: there is no "before" content.
+        raw_file: Any = ["ADDFILE"]
+    elif file_diff["tgt_file"] == "/dev/null":
+        # Deleted file: the raw diff's single hunk contains the full file.
+        raw_file = [line[1:] + "\n" for line in file_diff["hunks_process"][0][3:]]
+    else:
+        # Fetch the post-commit ("after") file from raw.githubusercontent.com.
+        file_raw_url = (f"https://raw.githubusercontent.com/{repo_owner}/"
+                        f"{repo_name}/{commit_hash}/{file_diff['tgt_file'][2:]}")
+        try:
+            raw_file = urllib.request.urlopen(file_raw_url)
+            raw_file_encoding = raw_file.headers.get_charsets()[0]
+            raw_file = [line.decode(raw_file_encoding) for line in raw_file.readlines()]
+            if length_threshold > 0 and len(raw_file) > length_threshold:
+                return ""
+        except Exception:
+            return ""
+        # Reverse-apply each hunk: keep context (" ") and removed ("-") lines,
+        # drop added ("+") ones.
+        for hunk in file_diff["hunks_process"]:
+            hunk_list = []
+            for line in hunk[3:]:
+                if line.startswith("-") or line.startswith(" "):
+                    hunk_list.append(line[1:] + "\n")
+            raw_file[hunk[0][0] - 1:hunk[0][0] + hunk[1][1] - 1] = hunk_list
+    del file_diff["hunks_process"]  # Also removes the key from the caller's dict.
+    return "".join(raw_file)
+
+
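+# To make the hunk bookkeeping above concrete: for a toy one-hunk diff,
+# unidiff's str(hunk) looks like
+#     @@ -1,2 +1,2 @@
+#     -print("helo")
+#      print("world")
+#     +print("hello")
+# process_ind_patch() splits that text on "@@": the header part ("-1,2 +1,2")
+# becomes [[1, 2], [1, 2]] via the strip/split/map(int) step, and the body
+# lines follow those two number pairs in each "hunks_process" entry, which is
+# why get_before_file() reads hunk[0]/hunk[1] as the source/target
+# (start, count) pairs and hunk[3:] as the body lines.
+
+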
+def process_commit(commit_data: dict, config: argparse.Namespace) -> list[dict]:
+    """
+    Process a commit dictionary to get the before files and diff dicts.
+
+    Args:
+        commit_data (dict): Dictionary containing the commit hash, repo name,
+            commit message, and related metadata.
+        config (argparse.Namespace): Scraper configuration flags.
+
+    Returns:
+        list[dict]: A list of dicts, where each dict contains the data for a
+            change to a single file.
+    """
+    if config.python_only and commit_data["language_name"] != "Python":
+        return []
+    # Fetch the commit's full diff from GitHub.
+    diff_url = f"https://github.com/{commit_data['repo_name']}/commit/{commit_data['commit']}.diff"
+    try:
+        diff = urllib.request.urlopen(diff_url)
+        encoding = diff.headers.get_charsets()[0]
+        patch = PatchSet(diff, encoding=encoding)
+        if len(patch) == 0:
+            return []
+    except Exception as e:
+        # print(e, diff_url)
+        return []
+    commit_list: list[dict] = []
+    # Iterate over the files changed in this commit.
+    for patch_ind in patch:
+        if config.ignore_deletions and patch_ind.target_file == "/dev/null":
+            continue
+        if config.diff_length_threshold > 0 and sum(len(hunk) for hunk in patch_ind) > config.diff_length_threshold:
+            continue
+        # Skip non-text (e.g. binary) files, which report no added or removed lines.
+        if patch_ind.added == 0 and patch_ind.removed == 0:
+            continue
+        diff_dict: dict = process_ind_patch(patch_ind)
+        diff_dict["before_file"] = get_before_file(diff_dict, commit_data["commit"], commit_data["repo_name"],
+                                                   length_threshold=config.code_length_threshold)
+        if not diff_dict["before_file"]:
+            # Empty string: the fetch failed or the file exceeded the threshold.
+            continue
+        diff_dict["commit"] = commit_data["commit"]
+        diff_dict["message"] = commit_data["message"]
+        diff_dict["repo_name"] = commit_data["repo_name"]
+        diff_dict["language_name"] = commit_data["language_name"]
+        diff_dict["author_name"] = commit_data["author"]["name"]
+        diff_dict["license"] = commit_data["license"]
+        commit_list.append(diff_dict)
+    return commit_list
+
+
+class GitHubDiffDataset(Dataset):
+    def __init__(self, config):
+        self.config = config
+        self.scraper = GitHubDiffScraper(self.config)
+
+    def download(self, *args, **kwargs) -> RawDataset:
+        return self.scraper.scrape()
+
+    def process(self):
+        raise NotImplementedError
+
+    @property
+    def info(self) -> DatasetInfo:
+        return DatasetInfo(
+            id="GitHubDiffDataset",
+            description="Dataset of diffs from GitHub")
+
+    @property
+    def id(self) -> str:
+        return ""
+
+
+class GitHubDiffScraper(Scraper):
+    def __init__(self, config):
+        # TODO: Dask multi-node scheduling here.
+        self.config = config
+        cluster = LocalCluster(n_workers=config.n_workers, threads_per_worker=config.threads_per_worker)
+        self.client = Client(cluster)
+        self.read_path = Path(config.read_path)
+        self.save_path = Path(config.save_path)
+
+    def scrape(self) -> RawDataset:
+        # Output schema; dask needs the dataframe meta up front.
+        meta_spec = {'hunks': str, 'addition_count': int, 'deletion_count': int,
+                     'src_file': str, 'tgt_file': str, 'file_extension': str,
+                     'before_file': str, 'commit': str, 'message': str,
+                     'repo_name': str, 'language_name': str, 'author_name': str,
+                     'license': str}
+        result = (
+            db.read_text(self.read_path).map(json.loads)
+            .map(process_commit, config=self.config).flatten().to_dataframe(meta=meta_spec)
+            .to_parquet(self.save_path)
+        )
+        progress(result)
+        dataset = RawDataset(storage_uris=["https://github.com/CarperAI/Code-Pile"], complete=True)
+        return dataset
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser('codepile dataset tool')
+
+    parser.add_argument('--read_path', type=str)
+    parser.add_argument('--save_path', type=str)
+    parser.add_argument('--n_workers', type=int, default=8)
+    parser.add_argument('--threads_per_worker', type=int, default=1)
+    # argparse's type=bool treats any non-empty string as True, so use
+    # proper boolean flag actions instead.
+    parser.add_argument('--python_only', action='store_true')
+    parser.add_argument('--diff_length_threshold', type=int, default=1000,
+                        help="Maximum number of lines in the diff for a *single* file. Set to 0 for no limit.")
+    parser.add_argument('--code_length_threshold', type=int, default=1000,
+                        help="Maximum number of lines in code files. Set to 0 for no limit.")
+    parser.add_argument('--ignore_deletions', action=argparse.BooleanOptionalAction, default=True,
+                        help="Ignore file deletion diffs.")
+    config = parser.parse_args()
+    ghdiff_dataset = GitHubDiffDataset(config)
+    ghdiff_dataset.download()
diff --git a/codepile/ghdiffs/test/ghdiffs_dummy.parquet b/codepile/ghdiffs/test/ghdiffs_dummy.parquet
new file mode 100644
index 0000000..e30733b
Binary files /dev/null and b/codepile/ghdiffs/test/ghdiffs_dummy.parquet differ
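A minimal sketch of running the scraper programmatically, mirroring the CLI defaults above; the paths and the sample record are placeholders, with the record fields matching what process_commit reads:

    import argparse
    from codepile.ghdiffs.ghdiffs_scrape import GitHubDiffDataset

    # Each line of read_path is one JSON commit record, e.g.:
    # {"commit": "<sha>", "repo_name": "octocat/Hello-World",
    #  "message": "Fix typo", "language_name": "Python",
    #  "author": {"name": "Octocat"}, "license": "mit"}
    config = argparse.Namespace(
        read_path="commits.jsonl",     # placeholder input path (JSON lines)
        save_path="ghdiffs_parquet/",  # placeholder output directory
        n_workers=8, threads_per_worker=1,
        python_only=False,
        diff_length_threshold=1000, code_length_threshold=1000,
        ignore_deletions=True,
    )
    GitHubDiffDataset(config).download()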