
Commit 707ce11

Merge pull request #318 from mkswagger/main
[Hacktoberfest] Added Resume Parser 🌟
2 parents 600cf90 + 26f9d0b commit 707ce11

File tree

8 files changed: +423 -0 lines changed

Binary file (2.11 KB) not shown.

resume_parser/info_extractor.py

Lines changed: 60 additions & 0 deletions
@@ -0,0 +1,60 @@
import os
import re
from typing import List, Optional

import docx
import gensim
import pandas as pd
from tika import parser


class InfoExtractor:
    @staticmethod
    def extractSkills(filename: str) -> List[str]:
        try:
            # getDocumentText returns None for unsupported extensions; fall back to "".
            text = getDocumentText(filename, parser) or ""
        except Exception:
            print("Error in File: " + filename)
            text = ""

        # skills.csv ships alongside this module; its column headers form the skill vocabulary.
        data = pd.read_csv(os.path.join(os.path.dirname(__file__), "skills.csv"))
        skills = list(data.columns.values)
        skillset = []

        for token in gensim.utils.simple_preprocess(text):
            if token in skills:
                skillset.append(token)
        # Deduplicate case-insensitively, then capitalize for display.
        return [i.capitalize() for i in set([i.lower() for i in skillset])]


def getDocumentText(f: str, parser) -> Optional[str]:
    if f.endswith(".pdf"):
        new_text = getPDFText(f, parser)
    elif f.endswith(".docx"):
        new_text = getDocxText(f)
    else:
        return None

    try:
        # Collapse runs of three or more newlines and strip en dashes (U+2013).
        new_text = re.sub(r"\n{3,}", "\n", new_text)
        new_text = str(bytes(new_text, "utf-8").replace(b"\xe2\x80\x93", b""), "utf-8")
    except Exception:
        print("Error in Reading File: " + f)
        new_text = ""
    return new_text


def getDocxText(filename: str) -> str:
    doc = docx.Document(filename)
    fullText = []
    for para in doc.paragraphs:
        fullText.append(para.text)
    return "\n".join(fullText)


def getPDFText(filename: str, parser) -> str:
    raw = parser.from_file(filename)
    new_text = raw["content"]
    if "title" in raw["metadata"]:
        title = raw["metadata"]["title"]
        new_text = new_text.replace(title, "")
    return new_text
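
A minimal usage sketch for the extractor above, assuming a skills.csv whose column headers are skill names sits next to the module and that tika, gensim, python-docx, and pandas are installed; the resume path here is hypothetical:

from info_extractor import InfoExtractor

# Hypothetical input; any .pdf or .docx resume should work.
found = InfoExtractor.extractSkills("uploads/sample_resume.pdf")
print(found)  # e.g. ['Python', 'Docker', 'React'], depending on skills.csv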

resume_parser/ml_loader.py

Lines changed: 182 additions & 0 deletions
@@ -0,0 +1,182 @@
import os
import shutil

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

from info_extractor import InfoExtractor

testAlgo = LogisticRegression(solver='lbfgs', multi_class='auto')

trainResumePathDictionary = {}
trainResumeSkillsDictionary = {}
trainY = []
resumeBaseUrl = "training-data/"
# Training folders; note that 'QA' (class 2) is handled downstream but has no training folder here.
processingSet = ['FE', 'BE', 'DevOps']
dataFrameDictionary = {}

try:
    for currentSet in processingSet:
        currentPath = resumeBaseUrl + currentSet
        trainResumePathDictionary[currentSet] = [
            os.path.join(currentPath, f)
            for f in os.listdir(currentPath)
            if os.path.isfile(os.path.join(currentPath, f))
        ]
except OSError as e:
    print('Error: ' + str(e))


resumeVectorizer = CountVectorizer()


def prepareResumeNameAsIndex(resumesList):
    # Map positional index -> bare file name, for readable DataFrame rows.
    indexes = {}
    for i in range(len(resumesList)):
        indexes[i] = os.path.basename(resumesList[i])
    return indexes


def prepareOutputClassesForTrainingSet(currentSet):
    if currentSet == 'FE':
        trainY.append(0)
    elif currentSet == 'BE':
        trainY.append(1)
    elif currentSet == 'QA':
        trainY.append(2)
    elif currentSet == 'DevOps':
        trainY.append(3)


def extractTrainingText(resumes, currentSet):
    countFilesRead = 0
    trainResumeSkillsDictionary[currentSet] = []
    tempSplittedTextContainerForDataFrame = []
    for currentResume in resumes:
        countFilesRead += 1
        if countFilesRead % 100 == 0:
            print("Resumes Read for " + currentSet + " = " + str(countFilesRead))
        tempSplittedTextForDataFrame = InfoExtractor.extractSkills(currentResume)
        tempSplittedTextContainerForDataFrame.append(tempSplittedTextForDataFrame)
        individualResumeSkills = " ".join(tempSplittedTextForDataFrame)
        trainResumeSkillsDictionary[currentSet].append(individualResumeSkills)
        prepareOutputClassesForTrainingSet(currentSet)
    currentResumeDataFrame = pd.DataFrame(tempSplittedTextContainerForDataFrame)
    currentResumeDataFrame.rename(index=prepareResumeNameAsIndex(trainResumePathDictionary[currentSet]), inplace=True)
    return currentResumeDataFrame


def trainDataSet():
    for currentSet in processingSet:
        dataFrameDictionary[currentSet] = extractTrainingText(trainResumePathDictionary[currentSet], currentSet)
        print('----------Extraction completed for dataset: ' + currentSet + '------------')


def fetchValuesForTraining(currentDataset):
    # Flatten the per-category dictionary into one ordered list.
    tempSkillsToTrainSet = []
    for currentSet in processingSet:
        tempSkillsToTrainSet += currentDataset[currentSet]
    return tempSkillsToTrainSet


def normalizeLanguageForMachine():
    # Fit the bag-of-words vocabulary on all training skill strings,
    # then turn each resume's skill string into a count vector.
    Resume_Vector = []
    normalizedData = []

    skillsToTrain = fetchValuesForTraining(trainResumeSkillsDictionary)
    resumeVectorizer.fit(skillsToTrain)

    for text in skillsToTrain:
        vector = resumeVectorizer.transform([text])
        Resume_Vector.append(vector.toarray())

    for x in Resume_Vector:
        normalizedData.append(x[0])

    return normalizedData


def classifyTestedResumes(testResumes, predictedResumes):
    resultDestinationBaseUrl = "result/resumes/"
    namesOnly = []
    predictedNames = []
    for i in range(len(testResumes)):
        namesOnly.append(os.path.basename(testResumes[i]))
    for i in range(len(predictedResumes)):
        currentName = namesOnly[i]
        if predictedResumes[i] == 0:
            classifyResumesInFolders(testResumes[i], resultDestinationBaseUrl + 'FE/' + currentName)
            predictedNames.append("Front End Resume")
        elif predictedResumes[i] == 1:
            classifyResumesInFolders(testResumes[i], resultDestinationBaseUrl + 'BE/' + currentName)
            predictedNames.append("Back End Resume")
        elif predictedResumes[i] == 2:
            classifyResumesInFolders(testResumes[i], resultDestinationBaseUrl + 'QA/' + currentName)
            predictedNames.append("QA Resume")
        elif predictedResumes[i] == 3:
            classifyResumesInFolders(testResumes[i], resultDestinationBaseUrl + 'DevOps/' + currentName)
            predictedNames.append("DevOps Resume")
    return namesOnly, predictedNames


def testAndClassifyResumes():
    resumePathTest = "uploads"
    testResumes = [os.path.join(resumePathTest, f) for f in os.listdir(resumePathTest)
                   if os.path.isfile(os.path.join(resumePathTest, f))]
    skillsToTrainTest = []
    for testResume in testResumes:
        testSkills = InfoExtractor.extractSkills(testResume)
        skillsToTrainTest.append(" ".join(testSkills))
    # Vectorize test resumes with the vocabulary fitted on the training set.
    newArrayToPredict = resumeVectorizer.transform(skillsToTrainTest).toarray()
    predictedResumes = testAlgo.predict(newArrayToPredict)
    return classifyTestedResumes(testResumes, predictedResumes)


def trainMachineLearningAlgorithm(normalizedDataForProcessing, trainY):
    trainX = np.array(normalizedDataForProcessing)
    trainY = np.array(trainY).ravel()  # scikit-learn expects a 1-D label array
    testAlgo.fit(trainX, trainY)
    print(trainX.shape)
    print(trainY.shape)


# "src/data/test/resumes/export_dataframe.csv"
def getTrainingDataFromCSV(file):
    trainingSetFromCSV = pd.read_csv(file)
    trainYFromFile = np.array(trainingSetFromCSV['outputClass']).reshape(-1, 1)
    trainXFromFile = np.array(trainingSetFromCSV.drop(columns=['outputClass']).values.tolist())
    print(trainYFromFile.shape)
    print(trainXFromFile.shape)
    return trainXFromFile, trainYFromFile, trainingSetFromCSV


def normalizeDataAndWriteToFile(file):
    normalizedDataForProcessing = normalizeLanguageForMachine()
    TransformedResumesData = pd.DataFrame(normalizedDataForProcessing)
    TransformedResumesData = TransformedResumesData.join(pd.DataFrame({'outputClass': trainY}))
    print(TransformedResumesData.shape)
    TransformedResumesData.rename(index=prepareResumeNameAsIndex(fetchValuesForTraining(trainResumePathDictionary)),
                                  inplace=True)
    # TransformedResumesData.columns = resumeVectorizer.get_feature_names()
    print(TransformedResumesData.shape)
    TransformedResumesData.to_csv(file, index=None, header=True)
    return normalizedDataForProcessing


def classifyResumesInFolders(source, destination):
    # Create the destination category folder if needed, then copy the resume into it.
    if not os.path.exists(destination.rsplit('/', 1)[0]):
        os.makedirs(destination.rsplit('/', 1)[0])
    shutil.copyfile(source, destination)


trainDataSet()

normalizedDataForProcessing = normalizeDataAndWriteToFile('training-data/training_data_for_resumes.csv')

trainMachineLearningAlgorithm(normalizedDataForProcessing, trainY)
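
The training flow above reduces to bag-of-words counts over skill strings fed into multinomial logistic regression. A self-contained sketch of that flow on made-up data (the skill strings and labels are illustrative, not from the actual training set):

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

# Toy skill strings (one per resume) and their class labels: 0=FE, 1=BE, 3=DevOps.
train_texts = ["react css javascript", "java spring sql", "docker kubernetes jenkins"]
train_labels = [0, 1, 3]

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(train_texts).toarray()

clf = LogisticRegression(solver='lbfgs', multi_class='auto')
clf.fit(X, train_labels)

# A new resume's skills must be vectorized with the SAME fitted vocabulary before predicting.
X_new = vectorizer.transform(["kubernetes docker terraform"]).toarray()
print(clf.predict(X_new))  # likely [3] (DevOps) on this toy data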
Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
# resume_scanner
A program that scans input text files for a specified phrase and returns the list of files containing it.

Takes file locations and the phrase as command-line arguments.
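
The implementation that follows is in C; for orientation, here is a minimal Python sketch of the behavior the README describes (the file names and entry point are illustrative, not part of this commit):

import sys

def files_with_phrase(phrase, paths):
    # Return the paths whose contents contain the phrase.
    matches = []
    for path in paths:
        try:
            with open(path) as f:
                if phrase in f.read():
                    matches.append(path)
        except OSError:
            print("could not read: " + path)
    return matches

if __name__ == "__main__":
    # usage: python scan.py PHRASE FILE [FILE ...]
    print(files_with_phrase(sys.argv[1], sys.argv[2:]))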
Lines changed: 65 additions & 0 deletions
@@ -0,0 +1,65 @@
/*
Name: SUDHANVA RAJESH
SRN: PES1UG19CS512
SEC: Q
ROLL NUMBER: 58
SUBJECT: PSWC
SUBJECT CODE: UE19CS151
TEACHER IN CHARGE: SUPREETHA S

Some requirements for the program to work:
1. Each file location passed on the command line must be entered exactly (for example C:\Users\sudhanva\Desktop\pswc_project\test1.txt) if the file is in a different folder; if the file is in the same folder as the program, the file name alone (e.g. file1.txt) is sufficient.
2. At most 20 files can contain the required pattern; this can be increased, but the size of the files array declared in the main function must be increased accordingly.
3. Patterns can be up to 10 characters long; this can be increased by enlarging the pattern and temp_pattern arrays declared in the main function.
4. In each file passed, the last character must be followed by a tab so the program can recognize the end of the file; multi-line files are accepted.
5. The files passed must not contain tab characters anywhere except at the end of the file. This is the one limitation I was not able to overcome.
6. First command-line argument = first pattern, second argument = second pattern, third argument = conditional operator, fourth argument onwards = files.
*/

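/* A hypothetical invocation, following the argument order in requirement 6 above
   and assuming the compiled binary is named a.out:
       a.out java python AND resume1.txt resume2.txt
   This searches the contents of resume1.txt and resume2.txt for the patterns
   "java" and "python", combined with the AND operator. */
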
#include"header.h"
20+
#include<stdio.h>
21+
#include<string.h>
22+
int search(char pattern[10],char filecontent[30])//function to check if given pattern is present in the file passed,returns non zero if found.
23+
{
24+
25+
if(strstr(filecontent,pattern)!=NULL)//at this step,if the pattern is found,the char array pattern points to the location at which the pattern was found in the contents of the file,hence we need to reassign it to the required pattern at every iteration
26+
{
27+
return 1;
28+
}
29+
else
30+
{
31+
return 0;
32+
}
33+
}
34+
35+
int operator_check(char op[4])
36+
{
37+
if((op[0]=='O'&&op[1]=='R')||(op[0]=='o'&&op[1]=='r'))
38+
{
39+
return 1;
40+
}
41+
else if((op[0]=='A'&&op[1]=='N'&&op[2]=='D')||(op[0]=='a'&&op[1]=='n'&&op[2]=='d'))
42+
{
43+
return 2;
44+
}
45+
else {
46+
return 0;
47+
}
48+
}
49+
50+
51+
52+
void files_with_pattern(int counter,char *files[20])
53+
{
54+
int i;
55+
printf("---------------------------------------------------\n");
56+
printf("Number of files containing the given pattern= %d\n",counter);
57+
printf("---------------------------------------------------\n");
58+
printf("list of the location of files containing given pattern:\n");
59+
for(i=0;i<counter;i++)
60+
{
61+
printf("file %d = %s\n",i+1,files[i]);
62+
}
63+
printf("---------------------------------------------------\n");
64+
65+
}
Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
int search(char pattern[10], char filecontent[30]);
int operator_check(char op[4]);
void files_with_pattern(int counter, char *files[20]);
