# reporting.py
#CHATGPT
import json
import os

# English stopwords to exclude from token counts
stopwords = {
    "a", "about", "above", "after", "again", "against", "all", "am", "an",
    "and", "any", "are", "aren't", "as", "at", "be", "because", "been",
    "before", "being", "below", "between", "both", "but", "by", "can't",
    "cannot", "could", "couldn't", "did", "didn't", "do", "does", "doesn't",
    "doing", "don't", "down", "during", "each", "few", "for", "from", "further",
    "had", "hadn't", "has", "hasn't", "have", "haven't", "having", "he", "he'd",
    "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself",
    "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into",
    "is", "isn't", "it", "it's", "its", "itself", "let's", "me", "more", "most",
    "mustn't", "my", "myself", "no", "nor", "not", "of", "off", "on", "once",
    "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over",
    "own", "same", "shan't", "she", "she'd", "she'll", "she's", "should",
    "shouldn't", "so", "some", "such", "than", "that", "that's", "the", "their",
    "theirs", "them", "themselves", "then", "there", "there's", "these", "they",
    "they'd", "they'll", "they're", "they've", "this", "those", "through", "to",
    "too", "under", "until", "up", "very", "was", "wasn't", "we", "we'd", "we'll",
    "we're", "we've", "were", "weren't", "what", "what's", "when", "when's",
    "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's",
    "with", "won't", "would", "wouldn't", "you", "you'd", "you'll", "you're",
    "you've", "your", "yours", "yourself", "yourselves"
}

def tokenize(raw_words):
    '''
    Gets a list of words (raw_words) and tokenizes it: lowercases each
    word, strips non-alphanumeric characters, and drops stopwords and
    one-character tokens.
    Returns: dict of unique words : frequency (unique_word_counter)
    '''
    tokenized_words = []      # Words remaining after cleaning and filtering
    unique_word_counter = {}  # Manually count words using a dictionary
    # Clean each word and eliminate stopwords
    for word in raw_words:
        word = word.lower()
        cleaned_word = ''.join(char for char in word if char.isalnum())
        if cleaned_word and cleaned_word not in stopwords and len(cleaned_word) > 1:
            tokenized_words.append(cleaned_word)
    # Populate the unique_word_counter dict with unique words : frequency
    for word in tokenized_words:
        if word in unique_word_counter:
            unique_word_counter[word] += 1  # Increment count if word is already in dictionary
        else:
            unique_word_counter[word] = 1
    return unique_word_counter  # Holds every surviving word not in stopwords
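
# Illustrative example (not part of the report pipeline), showing how
# tokenize() cleans and counts a small word list:
#   tokenize(["The", "quick", "quick", "fox!"])  ->  {"quick": 2, "fox": 1}
#   ("The" is a stopword; the trailing "!" is stripped from "fox!")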

def create_report(json_files):
    '''
    Parameters: json_files = list of JSON file names in the
                tokens_by_subdomain directory
    Returns: Nothing; prints the report to stdout
             (writing the report to files may be implemented later)
    '''
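    # Assumed layout of each JSON file (inferred from the parsing below,
    # not confirmed against whatever produces these files):
    # {
    #     "<some metadata key>": ...,
    #     "<subdomain>": [
    #         {"<url>": ["raw", "words", "scraped", "from", "that", "page"]},
    #         ...
    #     ]
    # }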
longest_page_info = ("", 0) # Initial longest tuple (url, word count)
subdomain_counter = {} # Regular dictionary to count unique URLs per subdomain
tokenized_words = {} # Dict of unique words : freq maintained for all the pages combined
url_highest_words = ("x", 0)
page_count = 0
# Iterates through each json file, populating our designated data structures
for json_file in json_files:
# Grabbing and opening file
file_path = os.path.join("tokens_by_subdomain", json_file) # Full path to the file
with open(file_path, 'r') as f:
json_data = json.load(f)
tokenized_word_count = 0
keys = list(json_data.keys()) # Get all keys as a list
subdomain_name = keys[1] # Access the second key
myData = json_data[subdomain_name] #gets the data for tokens
dict_counter = 0
for dictionary in myData:
for url, words in dictionary.items():
dict_counter += 1
word_counter = 0
# Tokenize myData and update the main tokenized_words dictionary
file_token_counts = tokenize(words) # Dictionary of word: frequency for the current file
for word, count in file_token_counts.items():
tokenized_word_count += count
word_counter += count
if word in tokenized_words:
tokenized_words[word] += count
else:
tokenized_words[word] = count
if word_counter > url_highest_words[1]:
url_highest_words = (url, word_counter)
# Check for the longest page for the report
if tokenized_word_count > longest_page_info[1]:
longest_page_info = (json_file, tokenized_word_count)
page_count += 1
# Increment the count for the subdomain
if subdomain_name in subdomain_counter:
subdomain_counter[subdomain_name] += 1 # Increment if exists
else:
subdomain_counter[subdomain_name] = 1 # Start count if new
    # Number of unique pages
    print(f"\033[1m\033[4mUnique Pages:\033[0m\n{page_count}\n")
    # Longest page by word count
    print(f"\033[1m\033[4mLongest page:\033[0m \nURL : {url_highest_words[0]}\nWord Count : {url_highest_words[1]}")
    # 50 most common words, counted manually (sorted by frequency, descending)
    most_common_words = sorted(tokenized_words.items(), key=lambda x: x[1], reverse=True)[:50]
    print("\033[1m\033[4m\n50 Most Common Words:\033[0m")
    words = [t[0] for t in most_common_words]
    print(words)
    # Subdomains and their page counts, in alphabetical order
    print("\033[1m\033[4m\nSubdomains in uci.edu:\033[0m")
    for subdomain, count in sorted(subdomain_counter.items()):
        print(f"{subdomain}, {count}")
    print()

def main():
    # Collect all the .json files in the tokens_by_subdomain folder
    json_files = [f for f in os.listdir("tokens_by_subdomain") if f.endswith('.json')]
    create_report(json_files)

if __name__ == "__main__":
    main()
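
# Usage sketch (assumes a tokens_by_subdomain/ directory of per-subdomain
# .json files sits in the working directory):
#   $ python reporting.py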