|
| 1 | +#Functions to replace characters that have unicode numbers higher than 127 and hence are not in the ASCII set and |
| 2 | +#convert an Excel input file into an ASCII text output file |
| 3 | +#Code written by Ashwin Kammula; |
| 4 | +#Usage: python -c 'import trialtrove_processing_wcomments; trialtrove_processing_wcomments.process("input.xlsx","output.txt")' |
| 5 | +import pandas as pd |
| 6 | +import numpy as np |
| 7 | +import sys |
| 8 | +#character_conversion_table.py defines character_mapping, a dictionary mapping the Unicode code points of special characters to their replacement strings |
| 9 | +from character_conversion_table import character_mapping |
| 10 | + |
| 11 | +#replace characters in string_list |
def replace(string_list, src, dest):
    """Replace every occurrence of ``src`` with ``dest`` in each string of ``string_list``.

    Args:
        string_list: iterable of cell values (for example a pandas Series).
        src: substring to search for.
        dest: replacement substring.

    Returns:
        A new list with one entry per input value: strings with ``src``
        replaced, ``np.nan`` for missing values (``None``/NaN), and any other
        non-string value passed through unchanged.
    """
    result = []
    for x in string_list:
        if isinstance(x, str):
            result.append(x.replace(src, dest))
        elif pd.isnull(x):
            # Normalise every kind of missing value to NaN
            result.append(np.nan)
        else:
            # Non-string cells (numbers, dates, ...) have nothing to replace;
            # calling .replace(src, dest) on them would raise.
            result.append(x)
    return result
| 14 | + |
def string_to_print(src):
    """Return a printable name for a separator character.

    A newline yields "newline", a tab yields "tab", and any other value
    yields ",".
    """
    if src == "\n":
        return "newline"
    if src == "\t":
        return "tab"
    return ","
| 17 | + |
def convert_characters(inputFile, outputFile):
    """Rewrite a UTF-8 text file with special characters replaced via ``character_mapping``.

    Each character of ``inputFile`` is looked up by its Unicode code point in
    ``character_mapping``:

    * mapped characters are replaced by their mapped string;
    * unmapped characters with a code point <= 127 are kept as they are;
    * unmapped characters above 127 are dropped and reported on stderr as
      "New character" so that a replacement can be added to the mapping.

    Afterwards the sequences ``?=?``, ``?<?`` and ``?>?`` are collapsed to the
    bare comparator, the text is written to ``outputFile``, and the number of
    occurrences of every character in the result is printed to stdout.

    ``inputFile`` and ``outputFile`` may be the same path: the input is read
    completely and closed before the output is opened.

    Args:
        inputFile: path of the UTF-8 encoded text file to read.
        outputFile: path of the text file to write (overwritten if it exists).
    """
    # Function-scope import so the block does not depend on the file header.
    from collections import Counter

    with open(inputFile, encoding="utf8") as source:
        text = source.read()

    # Collect the pieces and join once instead of growing a string per character.
    pieces = []
    for character in text:
        code = ord(character)
        replacement = character_mapping.get(code)
        if replacement is None:
            if code <= 127:
                pieces.append(character)
            else:
                # Identify characters that do not have a replacement, so they can be fixed
                print("New character: " + character + ", code: " + str(code),
                      file=sys.stderr)
        elif isinstance(replacement, str):
            pieces.append(replacement)
        else:
            # Best effort: a non-string mapping entry cannot be written as
            # text, so report it and drop the character instead of aborting.
            print(f"Invalid replacement {replacement!r} for code {code}",
                  file=sys.stderr)
    converted = "".join(pieces)

    # Three special cases for comparators flanked by ?, which are unusual spaces in Excel
    for comparator in ("=", "<", ">"):
        converted = converted.replace("?" + comparator + "?", comparator)

    # Explicit encoding keeps the output independent of the platform default;
    # pure-ASCII text is byte-identical in UTF-8.
    with open(outputFile, "w", encoding="utf8") as output:
        output.write(converted)

    # Print to stdout how often each character occurred, ordered by character
    counts = Counter(converted)
    for key in sorted(counts):
        print("Character: " + key + ", Code: " + str(ord(key)) + ", Count: " + str(counts[key]))
| 63 | + |
| 64 | +#input_file is the input, which should be an Excel file; output_file is the output file, in which the strings have been replaced |
def process(input_file, output_file):
    """Convert an Excel file into a tab-separated text file with special characters replaced.

    The sheet is loaded with ``pd.read_excel`` (default arguments). In every
    column of dtype "object", newlines and tabs inside cells are replaced by a
    single space. If no newline or tab remains, the table is written to
    ``output_file`` as tab-separated text without the index and then rewritten
    in place by ``convert_characters``. Otherwise the offending columns are
    reported on stdout and no output file is written.

    Args:
        input_file: path of the Excel file to read.
        output_file: path of the text file to write.
    """
    trial_trove_xl = pd.read_excel(input_file)
    # Work on a deep copy so the loaded frame itself stays untouched
    trial_trove_xl_corrected = trial_trove_xl.copy(deep=True)

    # Whitespace inside a cell that is not a plain space, mapped from:to
    replace_dic = {"\n": " ", "\t": " "}

    def object_columns():
        # Only columns of dtype "object" (= strings) are of interest
        is_object = list(trial_trove_xl_corrected.dtypes == "object")
        return trial_trove_xl_corrected.columns[is_object]

    for attribute in object_columns():
        for src, dest in replace_dic.items():
            trial_trove_xl_corrected[attribute] = replace(trial_trove_xl_corrected[attribute], src, dest)

    # Check that no "invalid characters" remain. The column set is recomputed
    # because a column's dtype can change when it is reassigned above.
    flag = True
    columns_to_check = object_columns()
    for src in replace_dic:
        for attribute in columns_to_check:
            cells = trial_trove_xl_corrected[attribute].dropna()
            # na=False: non-string cells yield NaN from str.contains, and NaN is
            # truthy, which would otherwise be reported as a remaining match.
            # regex=False: src is a literal character, not a pattern.
            if cells.str.contains(src, regex=False, na=False).any():
                print(f"Attribute: \"{attribute}\" still contains {string_to_print(src)}")
                flag = False

    # If all is well, write corrected DataFrame to csv
    if flag:
        print("All invalid characters were replaced successfully")
        trial_trove_xl_corrected.to_csv(output_file, sep='\t', index=False)
        convert_characters(output_file, output_file)
    else:
        print("Something is wrong. Check previous input for attributes still containing invalid characters")
0 commit comments