1 change: 1 addition & 0 deletions .gitignore
@@ -161,3 +161,4 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/
.vscode/
98 changes: 98 additions & 0 deletions codepile/ghdiffs/ghdiffs_repo_filter.py
@@ -0,0 +1,98 @@
import re
import pandas as pd
import requests
import logging
from io import StringIO
import os
from tqdm import tqdm
import json

logging.basicConfig(
level=logging.INFO,
format='%(asctime)s %(levelname)-8s %(message)s',
datefmt='%Y-%m-%d %H:%M:%S')
logger = logging.getLogger(__name__)


def read_json_file(json_path: str):
with open(json_path, "r") as f:
return [json.loads(line) for line in f]


def write_json_file(json_path: str, obj: dict):
with open(json_path, "w") as jsonFile:
json.dump(obj, jsonFile)


class GitHubDiffFilter:
def __init__(self) -> None:
repo_file_url = "https://raw.githubusercontent.com/EleutherAI/github-downloader/master/github_repositories.csv"
repo_file_string = StringIO(requests.get(repo_file_url).text)
self.single_file_count_threshold = 10_000
self.save_every = 1000
        if not os.path.isdir("tmp"):
            os.mkdir("tmp")
            logger.info("Created `tmp/` checkpoint directory")
self.ckpt_path = os.path.join("tmp", "ckpt.json")

if os.path.exists(self.ckpt_path):
            self.to_start = read_json_file(self.ckpt_path)[0]["index"]
else:
self.to_start = 0
self.repos_df = pd.read_csv(repo_file_string)
        self.top_repos_list = set(self.get_top_repo_list())  # Set for fast membership checks.
        self.checkpoint_list = []  # File indices at which checkpoints were written.
        logger.info(f"Number of top repos: {len(self.top_repos_list)}")

def get_top_repo_list(self) -> list:
return self.repos_df.iloc[:, 0].values.tolist()

def get_diff_path_list(self, github_diff_path: str) -> list:
"""
Get all the files in the given path
returns :
github_diff_list (list) : GitHub Diff path.
"""
return os.listdir(github_diff_path)

def __call__(self, github_diff_path, output_file_path):
github_diff_files = self.get_diff_path_list(github_diff_path)
logger.info(f"Starting to process {len(github_diff_files)} files..")
        # Counters and output buffers.
content_output_count = 0
file_index = 0
output_list = []
output_file_index = 0

        # Resume from the file index recorded in the last checkpoint.
github_diff_files = github_diff_files[self.to_start:]
for github_diff_file in tqdm(github_diff_files, total=len(github_diff_files)):
file_index += 1 # File index.

            if file_index % self.save_every == 0:
                # Periodically checkpoint the absolute file index so a later run can resume.
                write_json_file(self.ckpt_path, {"index": self.to_start + file_index})
github_diff_file_path = os.path.join(github_diff_path, github_diff_file)
github_diff_content_list = read_json_file(github_diff_file_path)
for ind_content in tqdm(github_diff_content_list, total=len(github_diff_content_list), leave=False):
if ind_content["repo_name"] in self.top_repos_list:
output_list.append(ind_content)
content_output_count += 1
self.checkpoint_list.append(file_index)

                    if content_output_count == self.single_file_count_threshold:
                        # Flush the buffered records to a new chunk file once the threshold is reached.
                        content_df = pd.DataFrame.from_dict(output_list, orient="columns")
                        output_inter_file_path = os.path.join(output_file_path, str(output_file_index) + ".json")
                        content_df.to_json(output_inter_file_path, orient="records", lines=True)
                        output_file_index += 1
                        output_list = []
                        content_output_count = 0

        # Write the final checkpoint and flush any remaining records.
        write_json_file(self.ckpt_path, {"index": self.to_start + file_index})
content_df = pd.DataFrame.from_dict(output_list, orient="columns")
output_final_file_path = os.path.join(output_file_path, str(output_file_index) + ".json")
content_df.to_json(output_final_file_path, orient="records", lines=True)


if __name__ == "__main__":
gh_diff_filter = GitHubDiffFilter()
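    # Minimal usage sketch (the paths below are hypothetical placeholders, not part
    # of this PR): the callable expects a directory of JSON-lines diff files and an
    # existing output directory for the filtered chunk files.
    # gh_diff_filter("path/to/ghdiffs", "path/to/filtered_output")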
183 changes: 183 additions & 0 deletions codepile/ghdiffs/ghdiffs_scrape.py
@@ -0,0 +1,183 @@
import json
from typing import Any
import urllib.request
import argparse
from pathlib import Path
import dask.bag as db
from codepile.dataset import (Dataset, DatasetInfo, RawDataset, Scraper)
from dask.distributed import Client, progress, LocalCluster
from unidiff import PatchSet


def process_ind_patch(patch_diff) -> dict:
"""Process patch to get diff data."""
patch_parsed_diff: dict = {
"hunks": [],
"hunks_process": [],
}

patch_parsed_diff["addition_count"] = patch_diff.added
patch_parsed_diff["deletion_count"] = patch_diff.removed
patch_parsed_diff["src_file"] = patch_diff.source_file
patch_parsed_diff["tgt_file"] = patch_diff.target_file
if patch_parsed_diff["tgt_file"] == "/dev/null":
patch_parsed_diff["file_extension"] = Path(patch_diff.source_file).suffix
else:
patch_parsed_diff["file_extension"] = Path(patch_diff.target_file).suffix
for patch_diff_ind in patch_diff:
patch_diff_ind = str(patch_diff_ind)
        # A hunk serialises as "@@ -src_start,src_count +tgt_start,tgt_count @@<section>\n<lines>".
        patch_diff_split = patch_diff_ind.split("@@")
        patch_diff_line = patch_diff_split[2].split("\n")
        # Header parsed into [[src_start, src_count], [tgt_start, tgt_count]].
        patch_diff_line_numbers = [list(map(int, hunk.strip("-+").split(",")))
                                   for hunk in patch_diff_split[1].strip().split(" ")]
        patch_parsed_diff["hunks_process"].append(patch_diff_line_numbers + patch_diff_line[:-1])
patch_parsed_diff["hunks"].append(patch_diff_ind)
patch_parsed_diff["hunks"] = "".join(patch_parsed_diff["hunks"])
return patch_parsed_diff
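
# A quick worked example of process_ind_patch's "hunks_process" layout (the hunk
# below is hypothetical, for illustration only): a hunk serialised as
#   "@@ -12,2 +12,3 @@\n context\n-old\n+new\n+added\n"
# yields the entry
#   [[12, 2], [12, 3], '', ' context', '-old', '+new', '+added']
# i.e. the source/target (start, count) pairs followed by the raw hunk lines.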


def get_before_file(file_diff: dict, commit_hash: str, repo_name: str, length_threshold: int) -> str:
repo_owner, repo_name = repo_name.split("/")
if file_diff["src_file"] == "/dev/null":
raw_file: Any = ["ADDFILE"]
elif file_diff["tgt_file"] == "/dev/null":
# If file is deleted, get before file from the raw diff, which will be the full file.
raw_file = [line[1:] + "\n" for line in file_diff["hunks_process"][0][3:]]
else:
        # Fetch the post-commit ("after") version of the file; the hunks are
        # reverse-applied below to recover the "before" version.
        file_raw_url = (f"https://raw.githubusercontent.com/{repo_owner}/"
                        f"{repo_name}/{commit_hash}/{file_diff['tgt_file'][2:]}")
        try:
            raw_file = urllib.request.urlopen(file_raw_url)
            raw_file_encoding = raw_file.headers.get_content_charset() or "utf-8"
            raw_file = [line.decode(raw_file_encoding) for line in raw_file.readlines()]
            if length_threshold > 0 and len(raw_file) > length_threshold:
                return ""
        except Exception:
            return ""
    # Reverse-apply each hunk. Once earlier hunks have been undone, this hunk's
    # "after" content sits at its *source* start line, so replace tgt_count lines
    # there with the hunk's before-image (context and "-" lines).
    for hunk in file_diff["hunks_process"]:
        hunk_list = []
        for line in hunk[3:]:
            if line.startswith(("-", " ")):
                hunk_list.append(line[1:] + "\n")
        raw_file[hunk[0][0] - 1:hunk[0][0] + hunk[1][1] - 1] = hunk_list
    del file_diff["hunks_process"]  # Also mutates the caller's dict; not needed downstream.
    return "".join(raw_file)


def process_commit(commit_data: dict, config: argparse.Namespace) -> list[dict]:
"""
Process a commit dictionary to get the before files and diff dict.

    Args:
        commit_data (dict): Dictionary containing the commit hash, repo name,
            commit message, author, license, and language.
        config (argparse.Namespace): Scraper configuration (filtering flags and
            length thresholds).

Returns:
list[dict]: A list of dicts, where each dict contains the data for a
change to a single file.
"""
if config.python_only and commit_data["language_name"] != "Python":
return []
# Scrape a commit's diff file.
diff_url = f"https://github.com/{commit_data['repo_name']}/commit/{commit_data['commit']}.diff"
try:
diff = urllib.request.urlopen(diff_url)
encoding = diff.headers.get_charsets()[0]
patch = PatchSet(diff, encoding=encoding)
if len(patch) == 0:
return []
except Exception as e:
# print(e, diff_url)
return []
commit_list: list[dict] = []
# Iterate over files within the diff.
for patch_ind in patch:
if config.ignore_deletions and patch_ind.target_file == "/dev/null":
continue
if config.diff_length_threshold > 0 and sum(len(hunk) for hunk in patch_ind) > config.diff_length_threshold:
continue
# Filter non-text files.
if patch_ind.added == 0 and patch_ind.removed == 0:
continue
diff_dict: dict = process_ind_patch(patch_ind)
diff_dict["before_file"] = get_before_file(diff_dict, commit_data["commit"], commit_data["repo_name"],
length_threshold=config.code_length_threshold)
if not diff_dict["before_file"]:
# Happens if exception is thrown or file is too long.
continue
diff_dict["commit"] = commit_data["commit"]
diff_dict["message"] = commit_data["message"]
diff_dict["repo_name"] = commit_data["repo_name"]
diff_dict["language_name"] = commit_data["language_name"]
diff_dict["author_name"] = commit_data["author"]["name"]
diff_dict["license"] = commit_data["license"]
commit_list.append(diff_dict)
return commit_list


class GitHubDiffDataset(Dataset):
def __init__(self, config):
self.config = config
self.scraper = GitHubDiffScraper(self.config)

def download(self, *args, **kwargs) -> RawDataset:
return self.scraper.scrape()

def process(self):
raise NotImplementedError

@property
def info(self) -> DatasetInfo:
return DatasetInfo(
id="GitHubDiffDataset",
description="Dataset of diffs from GitHub")

@property
def id(self) -> str:
return ""


class GitHubDiffScraper(Scraper):
def __init__(self, config):
# TODO: Dask multi-node scheduling here
self.config = config
cluster = LocalCluster(n_workers=config.n_workers, threads_per_worker=config.threads_per_worker)
self.client = Client(cluster)
self.read_path = Path(config.read_path)
self.save_path = Path(config.save_path)

def scrape(self) -> RawDataset:
meta_spec = {'hunks': str, 'addition_count': int, 'deletion_count': int,
'src_file': str, 'tgt_file': str, 'file_extension': str,
'before_file': str, 'commit': str, 'message': str,
'repo_name': str, 'language_name': str, 'author_name': str,
'license': str}
        # Build the parquet write lazily, then submit it so progress can be reported.
        write_task = (
            db.read_text(self.read_path).map(json.loads)
            .map(process_commit, config=self.config).flatten().to_dataframe(meta=meta_spec)
            .to_parquet(self.save_path, compute=False)
        )
        write_future = self.client.compute(write_task)
        progress(write_future)
        write_future.result()
dataset = RawDataset(storage_uris=["https://github.com/CarperAI/Code-Pile"], complete=True)
return dataset


if __name__ == "__main__":
parser = argparse.ArgumentParser('codepile dataset tool')

    parser.add_argument('--read_path', type=str, required=True)
    parser.add_argument('--save_path', type=str, required=True)
parser.add_argument('--n_workers', type=int, default=8)
parser.add_argument('--threads_per_worker', type=int, default=1)
    parser.add_argument('--python_only', action='store_true',
                        help="Only keep commits whose language_name is Python.")
parser.add_argument('--diff_length_threshold', type=int, default=1000,
help="Maximum number of lines in the diff for a *single* file. Set to 0 for no limit.")
parser.add_argument('--code_length_threshold', type=int, default=1000,
help="Maximum number of lines in code files. Set to 0 for no limit.")
    parser.add_argument('--ignore_deletions', action=argparse.BooleanOptionalAction, default=True,
                        help="Ignore file deletion diffs (pass --no-ignore_deletions to keep them).")
config = parser.parse_args()
ghdiff_dataset = GitHubDiffDataset(config)
ghdiff_dataset.download()
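    # Example invocation (hypothetical paths, shown for illustration only):
    #   python ghdiffs_scrape.py --read_path commits.jsonl --save_path ghdiffs_out \
    #       --n_workers 8 --python_only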
Binary file added codepile/ghdiffs/test/ghdiffs_dummy.parquet
Binary file not shown.