|
| 1 | +#!/usr/bin/env python3 |
| 2 | + |
| 3 | +import sys |
| 4 | + |
def hex_to_decimal(utf8_string):
    """Convert a three-digit hexadecimal string to its integer value.

    Args:
        utf8_string: Exactly three hexadecimal digits, e.g. ``"62A"``.
            Upper- and lower-case digits are both accepted.

    Returns:
        The non-negative integer the digits represent (``"62A"`` -> 1578).

    Raises:
        ValueError: If the string is not exactly three characters long or
            contains a character that is not a hexadecimal digit.
    """
    # Raise explicitly rather than assert: asserts are stripped under ``-O``.
    if len(utf8_string) != 3:
        raise ValueError(
            "expected exactly 3 hex digits, got {!r}".format(utf8_string)
        )

    # int(..., 16) alone also tolerates signs, whitespace, underscores and a
    # "0x" prefix, so the digits are checked first to keep the input strict.
    if not all(char in "0123456789abcdefABCDEF" for char in utf8_string):
        raise ValueError(
            "invalid hexadecimal digits in {!r}".format(utf8_string)
        )

    return int(utf8_string, 16)
| 20 | + |
def get_unicode_dict():
    """Build the code-point to ASCII transliteration table.

    The keys are Unicode code points in the Arabic block (the values that
    ``ord`` returns for those characters) and the values are the single
    ASCII characters they are transliterated to.  The pairs appear to follow
    the Buckwalter transliteration scheme -- NOTE(review): confirm against
    the scheme's reference table before extending it.

    Returns:
        A new ``dict`` mapping ``int`` code points to one-character strings.
        A fresh dictionary is created on every call, so callers may mutate
        the result freely.
    """
    # Keys are written as hex integer literals so the table needs no runtime
    # string-to-integer conversion.
    return {
        0x621: "'", 0x622: "|", 0x623: ">",
        0x624: "&", 0x625: "<", 0x626: "}",
        0x627: "A", 0x628: "b", 0x629: "p",
        0x62A: "t", 0x62B: "v", 0x62C: "j",
        0x62D: "H", 0x62E: "x", 0x62F: "d",
        0x630: "*", 0x631: "r", 0x632: "z",
        0x633: "s", 0x634: "$", 0x635: "S",
        0x636: "D", 0x637: "T", 0x638: "Z",
        0x639: "E", 0x63A: "g", 0x640: "_",
        0x641: "f", 0x642: "q", 0x643: "k",
        0x644: "l", 0x645: "m", 0x646: "n",
        0x647: "h", 0x648: "w", 0x649: "Y",
        0x64A: "y", 0x64B: "F", 0x64C: "N",
        0x64D: "K", 0x64E: "a", 0x64F: "u",
        0x650: "i", 0x651: "~", 0x652: "o",
        0x670: "`", 0x671: "{", 0x67E: "P",
        0x686: "J", 0x6A4: "V", 0x6AF: "G",
    }
| 47 | + |
| 48 | + |
def convert(word, unicode_dict):
    """Transliterate ``word`` one character at a time.

    Args:
        word: The string to transliterate.
        unicode_dict: Mapping from a character's code point (``ord(char)``)
            to its replacement string.

    Returns:
        The replacements for the mapped characters, concatenated in their
        original order.  Characters whose code point is not a key of
        ``unicode_dict`` are dropped, so the result may be shorter than
        ``word`` and is ``""`` when nothing is mapped.
    """
    return "".join(
        unicode_dict[code_point]
        for code_point in map(ord, word)
        if code_point in unicode_dict
    )
| 57 | + |
def _iter_sentences(lines):
    """Yield each sentence found inside ``<P>`` ... ``</P>`` paragraphs."""
    pending_words = []
    in_paragraph = False

    for line in lines:
        tokens = line.split()
        if not tokens:
            continue
        first_token = tokens[0]

        if first_token == "<P>":
            # Only the marker matters; anything after it on this line is
            # ignored.
            in_paragraph = True
        elif in_paragraph and first_token != "</P>":
            for word in tokens:
                if word == ".":
                    # A lone period closes the sentence.  Nothing is emitted
                    # when no words are pending (e.g. two periods in a row).
                    if pending_words:
                        yield " ".join(pending_words)
                        pending_words = []
                elif word[-1] == ".":
                    # Period glued to the word: drop only that final period,
                    # then close the sentence.
                    pending_words.append(word[:-1])
                    yield " ".join(pending_words)
                    pending_words = []
                else:
                    pending_words.append(word)

        if first_token == "</P>":
            in_paragraph = False
            # Emit words left over from a paragraph that did not end with a
            # period.
            if pending_words:
                yield " ".join(pending_words)
                pending_words = []

    # NOTE(review): words still pending at end of input (a paragraph with no
    # closing </P>) are not emitted -- confirm whether that is intended.


def process_arabic_text(arabic_text, unicode_dict):
    """Print the sentences of a paragraph-marked text file, one per line.

    A line whose first whitespace-separated token is ``<P>`` opens a
    paragraph and a line whose first token is ``</P>`` closes it.  Lines
    outside a paragraph and blank lines are skipped.  Inside a paragraph the
    words are accumulated across lines; a sentence is printed, with its
    words joined by single spaces, when a period is met (either as a token
    of its own or as the last character of a word) and when the paragraph
    closes with words still pending.

    Args:
        arabic_text: Path of the text file to read.  It is decoded as UTF-8.
        unicode_dict: Accepted for interface compatibility but not used;
            words are printed exactly as read.
            NOTE(review): presumably words were once passed through
            ``convert`` with this table -- confirm whether that call should
            be restored.

    Returns:
        None.  The sentences are written to standard output.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # An explicit encoding keeps Arabic input from being decoded with a
    # platform-dependent default; iterating the file streams it line by line.
    with open(arabic_text, "r", encoding="utf-8") as file:
        for sentence in _iter_sentences(file):
            print(sentence)
| 92 | + |
| 93 | + |
| 94 | + |
def main():
    """Run the script on the file named by the first command-line argument.

    Builds the transliteration table and hands it, together with the input
    path, to ``process_arabic_text``.  Any arguments after the first are
    ignored.

    Raises:
        SystemExit: With a usage message (exit status 1) when no input path
            is given on the command line.
    """
    if len(sys.argv) < 2:
        # Fail with a readable message instead of an IndexError traceback.
        program = sys.argv[0] if sys.argv else "script"
        sys.exit("usage: {} ARABIC_TEXT_FILE".format(program))

    arabic_text = sys.argv[1]
    unicode_dict = get_unicode_dict()
    process_arabic_text(arabic_text, unicode_dict)


# Run only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
0 commit comments