-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathmatching.py
More file actions
184 lines (152 loc) · 7.02 KB
/
matching.py
File metadata and controls
184 lines (152 loc) · 7.02 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import jaccard_score
from sklearn.metrics.pairwise import cosine_similarity
from resume_parser import Resume_Parser
from job_parser import JdParser
import multiprocessing as mp
import os
import pprint
import json
def to_percentage(value):
    """Format a 0..1 ratio as a percentage string with two decimal places."""
    return f"{value * 100:.2f}"
class MatchingEngine:
    """Scores candidate resumes against a job's required skills.

    All skills are lower-cased on construction so comparisons are
    case-insensitive.  Each scoring method returns a list of
    ``{'name': ..., 'score': ...}`` dicts (one per candidate), where
    ``score`` is a two-decimal percentage string from to_percentage().
    """

    def __init__(self, job_skills, candidates):
        # candidates: list of dicts with at least 'name' and 'skills' keys.
        self.job_skills = [skill.lower() for skill in job_skills]
        self.resumes_skills = [[skill.lower() for skill in candidate['skills']] for candidate in candidates]
        self.candidates = candidates

    def simple_intersection_score(self):
        """Score = fraction of required job skills the candidate covers."""
        rank = []
        # Hoisted loop invariant: the original rebuilt this set per candidate.
        job_skills_set = set(self.job_skills)
        for index, skills in enumerate(self.resumes_skills):
            common_skills = job_skills_set.intersection(skills)
            # Guard: empty job-skill list would otherwise divide by zero.
            score = len(common_skills) / len(job_skills_set) if job_skills_set else 0
            rank.append({'name': self.candidates[index]['name'], 'score': to_percentage(score)})
        return rank

    def cosine_similarity_with_tfidf(self):
        """Score = TF-IDF cosine similarity of job vs. resume skill text."""
        rank = []
        for index, skills in enumerate(self.resumes_skills):
            # A fresh vectorizer per pair so the vocabulary is exactly the
            # union of this job/resume document pair's terms.
            vectorizer = TfidfVectorizer()
            vectors = vectorizer.fit_transform([" ".join(self.job_skills), " ".join(skills)])
            score = cosine_similarity(vectors[0:1], vectors[1:2])[0, 0]
            rank.append({'name': self.candidates[index]['name'], 'score': to_percentage(score)})
        return rank

    def jaccard_similarity_score(self):
        """Score = Jaccard index |A ∩ B| / |A ∪ B| of the skill sets."""
        rank = []
        # Hoisted loop invariant: the original rebuilt this set per candidate.
        job_skills_set = set(self.job_skills)
        for index, skills in enumerate(self.resumes_skills):
            resume_skills_set = set(skills)
            union = job_skills_set.union(resume_skills_set)
            intersection = job_skills_set.intersection(resume_skills_set)
            # Guard: two empty skill lists would otherwise divide by zero.
            score = len(intersection) / len(union) if union else 0
            rank.append({'name': self.candidates[index]['name'], 'score': to_percentage(score)})
        return rank
def matching_result_wrapper(jd, resumes: list[dict]):
    """Score parsed resumes against a job description's skill list.

    jd: iterable of job-skill strings.
    resumes: parsed candidate dicts, each with 'name' and 'skills' keys.
             (Fixed annotation: the original said list[str], but
             MatchingEngine indexes each element with ['skills'] and
             ['name'], so the elements must be dicts.)

    Returns the simple-intersection ranking produced by MatchingEngine.
    """
    engine = MatchingEngine(jd, resumes)
    return engine.simple_intersection_score()
def cosine_similarity_with_tfidf(job_skills, candidate_skills):
    """Return the TF-IDF cosine similarity (0..1) of two skill lists.

    Each list is joined into one space-separated document; a vectorizer
    fitted on just that document pair supplies the term weights.
    """
    documents = [" ".join(job_skills), " ".join(candidate_skills)]
    tfidf_matrix = TfidfVectorizer().fit_transform(documents)
    similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
    return similarity[0, 0]
def jaccard_similarity_score(job_skills, candidate_skills):
    """Return the Jaccard index |A ∩ B| / |A ∪ B| of two skill lists.

    Returns 0 when both lists are empty (the union is empty).
    """
    required = set(job_skills)
    offered = set(candidate_skills)
    union = required | offered
    if not union:
        return 0
    return len(required & offered) / len(union)
def compare_profiles_with_expert(data):
    """Score multiple candidates against a subject's recommended skills,
    using an expert profile as a quality benchmark.

    data: dict with 'subjectData' ('recommendedSkills'), 'expertData'
          ('skills'), and 'candidateData' (list of dicts with 'name'
          and 'skills').

    Returns a dict with the aggregate profile/relevancy scores and a
    per-candidate breakdown (all percentages rounded to 2 decimals).
    """
    subject_skills = set(data["subjectData"]["recommendedSkills"])
    expert_skills = set(data["expertData"]["skills"])
    candidate_skills = [set(candidate["skills"]) for candidate in data["candidateData"]]
    # Aggregate all candidate skills into one pool.
    aggregated_candidate_skills = set().union(*candidate_skills)
    # Profile score: how much of the candidate pool the expert covers.
    # Guarded: empty sets would otherwise raise ZeroDivisionError.
    profile_score = (len(expert_skills.intersection(aggregated_candidate_skills))
                     / len(aggregated_candidate_skills) * 100) if aggregated_candidate_skills else 0
    # Job-match score: how much of the subject's recommendations the expert covers.
    job_match_score = (len(expert_skills.intersection(subject_skills))
                       / len(subject_skills) * 100) if subject_skills else 0
    # Weighted blend: profile coverage counts more than job match.
    relevancy_score = (0.6 * profile_score) + (0.4 * job_match_score)
    results = []
    for candidate in data["candidateData"]:
        skills = set(candidate["skills"])
        intersection_score = (len(subject_skills.intersection(skills))
                              / len(subject_skills) * 100) if subject_skills else 0
        cosine_score = cosine_similarity_with_tfidf(
            data["subjectData"]["recommendedSkills"],
            candidate["skills"]
        ) * 100
        # Bug fix: the original chained assignment ("jaccard_score_value =
        # jaccard_score = ...") also rebound the name of sklearn's imported
        # jaccard_score — single target now.
        jaccard_score_value = jaccard_similarity_score(
            data["subjectData"]["recommendedSkills"],
            candidate["skills"]
        ) * 100
        overall_similarity = (intersection_score + cosine_score + jaccard_score_value) / 3
        results.append({
            "name": candidate["name"],
            "intersection_score": round(intersection_score, 2),
            "cosine_similarity": round(cosine_score, 2),
            "jaccard_similarity": round(jaccard_score_value, 2),
            "overall_similarity": round(overall_similarity, 2)
        })
    return {
        "profile_score": round(profile_score, 2),
        "relevancy_score": round(relevancy_score, 2),
        "candidates": results
    }
def compare_profiles_with_board(data):
    """Score a single candidate against a subject's recommended skills.

    data: dict with 'subjectData' ('recommendedSkills') and
          'candidateData' (a single dict with 'name' and 'skills').

    Returns {'relevancy_score': ..., 'candidate': {...}} with all
    percentages rounded to 2 decimals.
    """
    subject_skills = set(data["subjectData"]["recommendedSkills"])
    candidate_skills = set(data["candidateData"]["skills"])
    # Guarded: an empty recommended-skill list would otherwise raise
    # ZeroDivisionError. Computed once — the original recomputed the
    # identical formula for both relevancy_score and intersection_score
    # (via the dead alias aggregated_candidate_skills = candidate_skills).
    intersection_score = (len(subject_skills.intersection(candidate_skills))
                          / len(subject_skills) * 100) if subject_skills else 0
    relevancy_score = intersection_score
    cosine_score = cosine_similarity_with_tfidf(
        data["subjectData"]["recommendedSkills"],
        data["candidateData"]["skills"]
    )
    # Renamed local: the original bound this to 'jaccard_score', shadowing
    # sklearn's imported jaccard_score inside the function.
    jaccard_value = jaccard_similarity_score(
        data["subjectData"]["recommendedSkills"],
        data["candidateData"]["skills"]
    )
    # cosine/jaccard are 0..1 here, so scale to percentages for the mean.
    overall_similarity = (intersection_score + cosine_score * 100 + jaccard_value * 100) / 3
    result = {
        "name": data["candidateData"]["name"],
        "intersection_score": round(intersection_score, 2),
        "cosine_similarity": round(cosine_score * 100, 2),
        "jaccard_similarity": round(jaccard_value * 100, 2),
        "overall_similarity": round(overall_similarity, 2)
    }
    return {
        "relevancy_score": round(relevancy_score, 2),
        "candidate": result
    }
if __name__ == '__main__':
    # Collect every file under the resume and JD directory trees.
    resumes = []
    jds = []
    for root, directories, filenames in os.walk('files/res/pdf'):
        for filename in filenames:
            resumes.append(os.path.join(root, filename))
    for root, directories, filenames in os.walk('files/jd/pdf'):
        for filename in filenames:
            jds.append(os.path.join(root, filename))
    if not jds:
        print("No job description files found in 'files/jd/pdf'. Exiting.")
    elif not resumes:
        print("No resume files found in 'files/res/pdf'. Exiting.")
    else:
        # Context manager terminates and cleans up the worker processes;
        # the original created the pool unconditionally and never closed it.
        with mp.Pool(mp.cpu_count()) as pool:
            async_results = [
                pool.apply_async(
                    matching_result_wrapper,
                    args=(jds[0], resumes)
                )
            ]
            # Bug fix: resolve the AsyncResult handles before printing.
            # The original left the .get() line commented out and pprinted
            # the raw AsyncResult objects instead of the match results.
            results = [r.get() for r in async_results]
        pprint.pprint(results)