55from sklearn .neighbors import NearestNeighbors
66import joblib
77
8+
89def get_skills_n_domains (skill_domain_dict ):
910 json .dump (skill_domain_dict , open ('skill_domain_dict.json' , 'w' ))
1011 skills = set ()
@@ -28,19 +29,19 @@ def get_skills_n_domains(skill_domain_dict):
2829 for skill in skills :
2930 oneHotSkillDomainList = []
3031 for domain in domains :
31- if (skill in skill_domain_map [domain ]):
32- oneHotSkillDomainList .append (1 )
33- else :
34- oneHotSkillDomainList .append (0 )
35- oneHotSkillDomainList .insert (0 ,skill )
32+ if (skill in skill_domain_map [domain ]):
33+ oneHotSkillDomainList .append (1 )
34+ else :
35+ oneHotSkillDomainList .append (0 )
36+ oneHotSkillDomainList .insert (0 , skill )
3637 SkillDomains .append (oneHotSkillDomainList )
3738 # print(SkillDomains)
38-
39+
3940 columns = []
4041 columns .extend (domains )
41- columns .insert (0 ,'SkillName' )
42- df_skill_n_domains = pd .DataFrame (SkillDomains ,columns = columns )
43- df_skill_n_domains .to_csv ('skill_n_domain.csv' ,index = False )
42+ columns .insert (0 , 'SkillName' )
43+ df_skill_n_domains = pd .DataFrame (SkillDomains , columns = columns )
44+ df_skill_n_domains .to_csv ('skill_n_domain.csv' , index = False )
4445
4546 SkillDomainsOneHot = []
4647 for skill in skills :
@@ -53,45 +54,50 @@ def get_skills_n_domains(skill_domain_dict):
5354 SkillDomainsOneHot .append (onehot )
5455 # print(SkillDomainsOneHot)
5556
56-
5757 return skills , SkillDomainsOneHot
5858
59+
def get_user_names(user_skill_dict):
    """Return the usernames (keys) of *user_skill_dict* as a list.

    Args:
        user_skill_dict: Mapping whose keys are usernames. The values are
            not read by this function.

    Returns:
        A new list containing every key of the mapping, in the mapping's
        iteration order.
    """
    # Only the keys are needed, so iterate the mapping directly instead of
    # unpacking (and discarding) each value from .items().
    return list(user_skill_dict)
6465
66+
def create_models(UserSkills, UserDomains):
    """Fit the skill-based and domain-based neighbour models and save them.

    Both inputs are converted to SciPy CSR matrices, a cosine-metric
    ``NearestNeighbors`` model (10 neighbours, all CPU cores) is fitted on
    each, and the fitted models are written with ``joblib.dump``.

    Args:
        UserSkills: User/skill feature matrix (anything ``csr_matrix``
            accepts).
        UserDomains: User/domain feature matrix (anything ``csr_matrix``
            accepts).

    Side effects:
        Writes 'KNN_user_skills.pkl' and 'KNN_user_domains.pkl' to the
        current working directory.
    """
    user_skill_features = csr_matrix(UserSkills)
    user_domain_features = csr_matrix(UserDomains)

    skill_based_model = NearestNeighbors(
        metric='cosine', n_neighbors=10, n_jobs=-1)
    domain_based_model = NearestNeighbors(
        metric='cosine', n_neighbors=10, n_jobs=-1)

    # Fit both models before persisting either one, so a failure while
    # fitting does not leave only one refreshed model file on disk.
    skill_based_model.fit(user_skill_features)
    domain_based_model.fit(user_domain_features)

    joblib.dump(skill_based_model, 'KNN_user_skills.pkl')
    joblib.dump(domain_based_model, 'KNN_user_domains.pkl')
7781
7882
def save_usernames_insequence(usernames):
    """Write *usernames* to 'usernames.txt' in the order given.

    Each username is followed by the separator ``'\\n '`` (a newline and a
    space). ``read_usernames_insequence`` strips both characters when it
    reads the file back.

    Args:
        usernames: Iterable of username strings.

    Side effects:
        Creates or overwrites 'usernames.txt' (UTF-8) in the current
        working directory.
    """
    with open('usernames.txt', 'w', encoding="utf-8") as f:
        # One buffered writelines call instead of one write per username.
        f.writelines(username + '\n ' for username in usernames)
8387
88+
def read_usernames_insequence():
    """Read 'usernames.txt' and return its lines as a list of usernames.

    Newline and space characters are stripped from both ends of every line
    (``str.strip('\\n ')``). Lines are returned in file order and none are
    dropped, so positions in the returned list match line positions in the
    file.

    Returns:
        List of stripped lines from 'usernames.txt' (read as UTF-8).

    Raises:
        FileNotFoundError: If 'usernames.txt' does not exist in the current
            working directory.
    """
    with open('usernames.txt', 'r', encoding="utf-8") as f:
        # Iterate the file lazily rather than materialising readlines().
        return [line.strip('\n ') for line in f]
8893
94+
8995def user_data_matrix (user_skill_dict , Allskills ):
90- # user_skill_dict = sorted(user_skill_dict)
96+ # user_skill_dict = sorted(user_skill_dict)
9197 UserSkills = []
9298 UserNames = []
93- for username ,skill_list in user_skill_dict .items ():
94- if (username != None ):
99+ for username , skill_list in user_skill_dict .items ():
100+ if (username != None ):
95101 UserNames .append (username .lower ())
96102 oneHotSkillList = []
97103 for skill in Allskills :
@@ -100,16 +106,18 @@ def user_data_matrix(user_skill_dict, Allskills):
100106 else :
101107 oneHotSkillList .append (0 )
102108 UserSkills .append (oneHotSkillList )
103- #write usernames in same sequence to text file and read also from text file
109+ # write usernames in same sequence to text file and read also from text file
104110 save_usernames_insequence (UserNames )
105111 return UserSkills
106112
113+
def weights(UserSkills, SkillDomains):
    """Return the product of the user/skill and skill/domain matrices.

    Both inputs are converted to ``float64`` NumPy arrays and combined with
    ``np.dot``.

    Args:
        UserSkills: Array-like of user/skill values.
        SkillDomains: Array-like of skill/domain values. Its leading
            dimension must be compatible with the trailing dimension of
            ``UserSkills`` for ``np.dot``.

    Returns:
        ``np.dot(UserSkills, SkillDomains)`` as a ``float64`` array.
    """
    # asarray avoids a copy when the caller already passes float64 arrays;
    # np.dot allocates a fresh result, so the inputs are never mutated.
    user_skill_matrix = np.asarray(UserSkills, dtype=np.float64)
    skill_domain_matrix = np.asarray(SkillDomains, dtype=np.float64)
    return np.dot(user_skill_matrix, skill_domain_matrix)
112119
120+
113121def get_target_user_data (target_user_skills ):
114122 target_skills = []
115123 df_skill_n_domain = pd .read_csv ('skill_n_domain.csv' )
@@ -119,18 +127,22 @@ def get_target_user_data(target_user_skills):
119127 target_skills .append (1 )
120128 else :
121129 target_skills .append (0 )
122- target_skills = np .array (target_skills ,dtype = np .float64 )
123- SkillDomains = np .array (df_skill_n_domain .iloc [:,1 :].values ,dtype = np .float64 )
130+ target_skills = np .array (target_skills , dtype = np .float64 )
131+ SkillDomains = np .array (
132+ df_skill_n_domain .iloc [:, 1 :].values , dtype = np .float64 )
124133 target_domains = np .dot (target_skills , SkillDomains )
125134 return target_skills , target_domains
126135
136+
127137def recommendUsers (target_user_skills , target_user_domains , UserNames ):
128138 skill_based_model = joblib .load ('KNN_user_skills.pkl' , mmap_mode = 'r' )
129139 domain_based_model = joblib .load ('KNN_user_domains.pkl' , mmap_mode = 'r' )
130140
131- #TODO: See this return distances parameter and how to access these distances
132- skills_based_similar_user_distances , skills_based_similar_users = skill_based_model .kneighbors ([target_user_skills ],10 )
133- domains_based_similar_user_distances , domains_based_similar_users = domain_based_model .kneighbors ([target_user_domains ],10 )
141+ # TODO: See this return distances parameter and how to access these distances
142+ skills_based_similar_user_distances , skills_based_similar_users = skill_based_model .kneighbors ([
143+ target_user_skills ], 10 )
144+ domains_based_similar_user_distances , domains_based_similar_users = domain_based_model .kneighbors ([
145+ target_user_domains ], 10 )
134146
135147 # skill_based_user_names = []
136148 # domain_based_user_names = []
@@ -140,7 +152,7 @@ def recommendUsers(target_user_skills, target_user_domains, UserNames):
140152 # skill_based_user_names.append(UserNames[usr_indx])
141153 # for usr_indx in domains_based_similar_users[0]:
142154 # domain_based_user_names.append(UserNames[usr_indx])
143-
155+
144156 Suggestions = list ()
145157 for usr_indx in skills_based_similar_users [0 ]:
146158 if (UserNames [usr_indx ] not in Suggestions ):
@@ -150,24 +162,30 @@ def recommendUsers(target_user_skills, target_user_domains, UserNames):
150162 Suggestions .append (UserNames [usr_indx ])
151163 return Suggestions
152164
153- #=======================================================================================================================
165+ # =======================================================================================================================
154166# MAIN LOGIC
155- #=======================================================================================================================
167+ # =======================================================================================================================
168+
169+
def update_models(skill_domain_dict, user_skill_dict):
    """Rebuild and persist the recommendation models from fresh data.

    Steps:
        1. ``get_skills_n_domains`` derives the skill collection and the
           skill/domain encoding from ``skill_domain_dict``.
        2. ``user_data_matrix`` encodes each user's skills against that
           skill collection (and saves the username order to disk).
        3. ``weights`` projects the user/skill matrix onto domains.
        4. ``create_models`` fits and saves both neighbour models.

    Args:
        skill_domain_dict: Skill/domain data passed to
            ``get_skills_n_domains``.
        user_skill_dict: Mapping of username to skills passed to
            ``user_data_matrix``.

    Returns:
        None.
    """
    skills, SkillDomains = get_skills_n_domains(skill_domain_dict)
    # user-skill data matrix
    UserSkills = user_data_matrix(user_skill_dict, skills)
    UserDomains = weights(UserSkills, SkillDomains)
    create_models(UserSkills, UserDomains)
    # TODO: add a return statement to see if everything went down properly
177+
162178
def predict(target_user_skills):
    """Return recommended usernames for the given target skills.

    The saved username sequence is read back from disk, the target skills
    are encoded by ``get_target_user_data`` into skill and domain vectors,
    and ``recommendUsers`` maps the nearest neighbours of those vectors to
    usernames.

    Args:
        target_user_skills: Documented by the original author as
            ``dict{"username": [skill1, skill2, ...]}``.
            NOTE(review): confirm the expected shape against
            ``get_target_user_data``.

    Returns:
        The suggestions list produced by ``recommendUsers``.
    """
    # Usernames are stored in the same order as the rows the models were
    # fitted on, so neighbour indices can be mapped back to names.
    UserNames = read_usernames_insequence()
    encoded_target_user_skills, encoded_target_user_domains = (
        get_target_user_data(target_user_skills))
    suggestions = recommendUsers(
        encoded_target_user_skills, encoded_target_user_domains, UserNames)
    return suggestions
171189
172190
173191"""
0 commit comments