Skip to content

Commit 6bbd533

Browse files
author
Alejandro A. Schaffer
committed
initial commit
1 parent 272e9d4 commit 6bbd533

File tree

1 file changed

+92
-0
lines changed

1 file changed

+92
-0
lines changed

trialtrove_processing_wcomments.py

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
#Functions to replace characters that have unicode numbers higher than 127 and hence are not in the ASCII set and
2+
#convert an Excel input file into an ASCII text output file
3+
#Code written by Ashwin Kammula;
4+
#Usage: python -c 'import trialtrove_processing_wcomments; trialtrove_processing_wcomments.process("input.xlsx","output.txt")'
5+
import pandas as pd
6+
import numpy as np
7+
import sys
8+
#character_conversion_table.py defines character_mapping, a dictionary mapping the code points of special characters to their replacement strings
9+
from character_conversion_table import character_mapping
10+
11+
#replace characters in string_list
12+
def replace(string_list, src, dest):
    """Return a list with ``src`` replaced by ``dest`` in every entry.

    Entries for which ``pd.isnull`` is true are emitted as ``np.nan``; every
    other entry has its own ``replace(src, dest)`` method called.
    """
    replaced = []
    for entry in string_list:
        if pd.isnull(entry):
            # Null-like entries are normalised to NaN rather than touched.
            replaced.append(np.nan)
        else:
            replaced.append(entry.replace(src, dest))
    return replaced
14+
15+
def string_to_print(src):
    """Return a printable label for a whitespace character.

    A newline gives "newline", a tab gives "tab", and anything else gives
    ",".
    """
    if src == "\n":
        return "newline"
    if src == "\t":
        return "tab"
    return ","
17+
18+
def convert_characters(inputFile, outputFile):
    """Rewrite a UTF-8 text file with special characters replaced.

    Each character of ``inputFile`` is looked up by its code point in
    ``character_mapping``:

    * a mapped character is replaced by its mapped string;
    * an unmapped character with code point <= 127 is kept as is;
    * any other unmapped character is dropped and reported on stderr as a
      "New character" so that a replacement can be added to the mapping.

    The converted text is written to ``outputFile``, and a table of how often
    each character occurs in the converted text is printed to stdout.

    ``inputFile`` and ``outputFile`` may be the same path: the input is read
    completely and closed before the output is opened for writing.
    """
    from collections import Counter

    with open(inputFile, encoding="utf8") as source:
        text = source.read()

    # Collect the pieces in a list and join once; repeated string
    # concatenation is quadratic in the size of the file.
    pieces = []
    for character in text:
        code = ord(character)
        match = character_mapping.get(code)
        if match is None:
            # identify characters that do not have a replacement, so they can be fixed
            if code <= 127:
                pieces.append(character)
            else:
                print("New character: " + character + ", code: " + str(code),
                      file=sys.stderr)
        elif isinstance(match, str):
            pieces.append(match)
        else:
            # A non-string replacement cannot be written out; report it and
            # skip the character instead of aborting the whole conversion.
            print("Replacement for code " + str(code) + " is not a string: " + repr(match),
                  file=sys.stderr)
    converted = "".join(pieces)

    # three special cases for comparators flanked by ?, which are unusual spaces in Excel
    for comparator in ('=', '<', '>'):
        converted = converted.replace('?' + comparator + '?', comparator)

    # The context manager guarantees the output is flushed and closed.
    with open(outputFile, 'w') as output:
        output.write(converted)

    # count how often each character occurs in the converted text
    counts = Counter(converted)

    # print to stdout how often each character occurred, ordered by character
    for key in sorted(counts):
        print("Character: " + key + ", Code: " + str(ord(key)) + ", Count: " + str(counts[key]))
63+
64+
#input_file is the input, which should be an Excel file; output_file is the output, in which strings are replaced
65+
def process(input_file, output_file):
    """Convert an Excel file into a tab-separated text file.

    Newlines and tabs inside the cells of "object" columns are replaced by
    plain spaces. If none remain afterwards, the frame is written to
    ``output_file`` as tab-separated text without the index, and that file is
    then passed through ``convert_characters`` in place. Otherwise nothing is
    written and the offending attributes are reported on stdout.
    """
    # Work on a deep copy so the frame as read from Excel is left unchanged.
    cleaned = pd.read_excel(input_file).copy(deep=True)

    # Whitespace inside a cell that is not a plain space, and its substitute.
    whitespace_map = {"\n": " ", "\t": " "}

    def object_columns():
        # Recomputed on every call: reassigning a column may change its dtype.
        return cleaned.columns[list(cleaned.dtypes == "object")]

    # Only columns of type "object" (= strings) are of interest.
    for column in object_columns():
        for old, new in whitespace_map.items():
            cleaned[column] = replace(cleaned[column], old, new)

    # Verify that none of the replaced characters survived.
    all_clean = True
    for old in whitespace_map:
        for column in object_columns():
            remaining = cleaned[column].dropna().str.contains(old)
            if any(remaining):
                print(f"Attribute: \"{column}\" still contains {string_to_print(old)}")
                all_clean = False

    if not all_clean:
        print("Something is wrong. Check previous input for attributes still containing invalid characters")
        return

    # All is well: write the corrected frame, then convert special characters.
    print("All invalid characters were replaced successfully")
    cleaned.to_csv(output_file, sep='\t', index=False)
    convert_characters(output_file, output_file)

0 commit comments

Comments
 (0)