-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpreprocessor.py
More file actions
81 lines (66 loc) · 2.87 KB
/
preprocessor.py
File metadata and controls
81 lines (66 loc) · 2.87 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
from typing import Any
import datasets as hf_datasets
from tqdm import tqdm
def filter_non_ascii(text: str) -> str:
    """Return ``text`` with every character outside the ASCII range dropped.

    Characters whose code point is below 128 are kept in their original
    order; all other characters are discarded rather than replaced.
    """
    ascii_chars = filter(str.isascii, text)
    return "".join(ascii_chars)
def clean_docstring(doc_string: str) -> str:
    """Reduce a docstring to its first paragraph as a single ASCII line.

    Blank lines that come before any text are skipped, so a docstring whose
    text starts on the line after the opening quotes still yields its first
    paragraph instead of an empty string. Each line of that paragraph is
    stripped of surrounding whitespace, collection stops at the first blank
    line that follows text, and the collected lines are joined with ". ".

    Args:
        doc_string: Raw documentation string.

    Returns:
        The first paragraph as one line, passed through ``filter_non_ascii``;
        an empty string when the input contains no non-blank line.
    """
    paragraph = []
    for line in doc_string.split("\n"):
        stripped_line = line.strip()
        if not stripped_line:
            if paragraph:
                # A blank line after some text ends the first paragraph.
                break
            # Still among the leading blank lines; keep looking for text.
            continue
        paragraph.append(stripped_line)
    return filter_non_ascii(". ".join(paragraph))
def clean_code(code: str) -> str:
    """Re-indent ``code`` with 4 spaces per nesting level.

    Nesting is inferred from the leading-whitespace width of each non-blank
    line. A stack of the widths that are currently open is kept: a line wider
    than the innermost open width opens a new level, and a narrower line
    closes every level that is deeper than it. Closing all deeper levels at
    once matters when a line leaves several nested blocks together; stepping
    back only one level there would attach the line to the wrong block.

    Whitespace-only lines are emitted as empty lines and do not influence the
    inferred nesting.

    Args:
        code: Source text with newline-separated lines.

    Returns:
        The re-indented text, passed through ``filter_non_ascii``.
    """
    cleaned_lines = []
    # Widths of the indentation levels currently open; the list index of a
    # width is its nesting depth. The base width 0 is never popped because a
    # measured width cannot be negative.
    open_widths = [0]
    for line in code.split("\n"):
        stripped_line = line.lstrip()
        if not stripped_line:
            cleaned_lines.append("")
            continue
        width = len(line) - len(stripped_line)
        # Close every level deeper than this line (multi-level dedent).
        while width < open_widths[-1]:
            open_widths.pop()
        # Anything wider than the innermost remaining level opens a new one.
        if width > open_widths[-1]:
            open_widths.append(width)
        depth = len(open_widths) - 1
        cleaned_lines.append(" " * (depth * 4) + stripped_line)
    return filter_non_ascii("\n".join(cleaned_lines))
def preprocess_batch(dataset: hf_datasets.Dataset) -> list[dict[str, str]]:
    """Clean every usable record of ``dataset`` and collect the results.

    Each record is handed to ``preprocess_record`` so that batch and
    single-record processing share one definition of which records are usable
    and how they are cleaned. Records for which it returns ``None`` (empty
    documentation string or empty code string) are left out.

    Args:
        dataset: Iterable of records exposing the ``func_documentation_string``
            and ``func_code_string`` fields. Iteration is wrapped in ``tqdm``.

    Returns:
        A list of ``{"description": ..., "code": ...}`` dicts, in dataset
        order, one per record that was kept.
    """
    filtered_data = []
    for record in tqdm(dataset):
        cleaned = preprocess_record(record)
        if cleaned is not None:
            filtered_data.append(cleaned)
    return filtered_data
def preprocess_record(record: dict[Any, Any])-> dict[str, str] | None:
if record['func_documentation_string'] and record['func_code_string']:
return {
"description": clean_docstring(record['func_documentation_string']),
"code": clean_code(record['func_code_string'])
}
return None