
Commit 707ce11

Merge pull request #318 from mkswagger/main
[Hacktoberfest] Added Resume Parser 🌟
2 parents 600cf90 + 26f9d0b commit 707ce11

File tree

8 files changed: +423 -0 lines changed

Binary file (2.11 KB) not shown.

resume_parser/info_extractor.py

Lines changed: 60 additions & 0 deletions
@@ -0,0 +1,60 @@
import os
import re
from typing import List, Optional

import docx
import gensim
import pandas as pd
from tika import parser


class InfoExtractor:
    @staticmethod
    def extractSkills(filename: str) -> List[str]:
        try:
            # getDocumentText returns None for unsupported extensions; fall back to "".
            text = getDocumentText(filename, parser) or ""
        except Exception:
            print("Error in File: " + filename)
            text = ""

        # skills.csv ships alongside this module; its column headers form the skill vocabulary.
        data = pd.read_csv(os.path.join(os.path.dirname(__file__), "skills.csv"))
        skills = list(data.columns.values)
        skillset = []

        for token in gensim.utils.simple_preprocess(text):
            if token in skills:
                skillset.append(token)
        # Deduplicate case-insensitively, then capitalize for display.
        return [i.capitalize() for i in set([i.lower() for i in skillset])]


def getDocumentText(f: str, parser) -> Optional[str]:
    if f.endswith(".pdf"):
        new_text = getPDFText(f, parser)
    elif f.endswith(".docx"):
        new_text = getDocxText(f)
    else:
        return None

    try:
        # Collapse runs of three or more newlines and strip en dashes (U+2013).
        new_text = re.sub(r"\n{3,}", "\n", new_text)
        new_text = str(bytes(new_text, "utf-8").replace(b"\xe2\x80\x93", b""), "utf-8")
    except Exception:
        print("Error in Reading File: " + f)
        new_text = ""
    return new_text


def getDocxText(filename: str) -> str:
    doc = docx.Document(filename)
    fullText = []
    for para in doc.paragraphs:
        fullText.append(para.text)
    return "\n".join(fullText)


def getPDFText(filename: str, parser) -> str:
    raw = parser.from_file(filename)
    new_text = raw["content"]
    if "title" in raw["metadata"]:
        title = raw["metadata"]["title"]
        new_text = new_text.replace(title, "")
    return new_text
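
A minimal usage sketch for the extractor above, assuming a skills.csv whose column headers are skill names sits next to the module and that tika, gensim, python-docx, and pandas are installed; the resume path here is hypothetical:

from info_extractor import InfoExtractor

# Hypothetical input; any .pdf or .docx resume should work.
found = InfoExtractor.extractSkills("uploads/sample_resume.pdf")
print(found)  # e.g. ['Python', 'Docker', 'React'], depending on skills.csv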

resume_parser/ml_loader.py

Lines changed: 182 additions & 0 deletions
@@ -0,0 +1,182 @@
import os
import shutil

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

from info_extractor import InfoExtractor

testAlgo = LogisticRegression(solver='lbfgs', multi_class='auto')

trainResumePathDictionary = {}
trainResumeSkillsDictionary = {}
trainY = []
resumeBaseUrl = "training-data/"
# Training folders; note that 'QA' (class 2) is handled downstream but has no training folder here.
processingSet = ['FE', 'BE', 'DevOps']
dataFrameDictionary = {}

try:
    for currentSet in processingSet:
        currentPath = resumeBaseUrl + currentSet
        trainResumePathDictionary[currentSet] = [
            os.path.join(currentPath, f)
            for f in os.listdir(currentPath)
            if os.path.isfile(os.path.join(currentPath, f))
        ]
except OSError as e:
    print('Error: ' + str(e))


resumeVectorizer = CountVectorizer()


def prepareResumeNameAsIndex(resumesList):
    # Map positional index -> bare file name, for readable DataFrame rows.
    indexes = {}
    for i in range(len(resumesList)):
        indexes[i] = os.path.basename(resumesList[i])
    return indexes


def prepareOutputClassesForTrainingSet(currentSet):
    if currentSet == 'FE':
        trainY.append(0)
    elif currentSet == 'BE':
        trainY.append(1)
    elif currentSet == 'QA':
        trainY.append(2)
    elif currentSet == 'DevOps':
        trainY.append(3)


def extractTrainingText(resumes, currentSet):
    countFilesRead = 0
    trainResumeSkillsDictionary[currentSet] = []
    tempSplittedTextContainerForDataFrame = []
    for currentResume in resumes:
        countFilesRead += 1
        if countFilesRead % 100 == 0:
            print("Resumes Read for " + currentSet + " = " + str(countFilesRead))
        tempSplittedTextForDataFrame = InfoExtractor.extractSkills(currentResume)
        tempSplittedTextContainerForDataFrame.append(tempSplittedTextForDataFrame)
        individualResumeSkills = " ".join(tempSplittedTextForDataFrame)
        trainResumeSkillsDictionary[currentSet].append(individualResumeSkills)
        prepareOutputClassesForTrainingSet(currentSet)
    currentResumeDataFrame = pd.DataFrame(tempSplittedTextContainerForDataFrame)
    currentResumeDataFrame.rename(index=prepareResumeNameAsIndex(trainResumePathDictionary[currentSet]), inplace=True)
    return currentResumeDataFrame


def trainDataSet():
    for currentSet in processingSet:
        dataFrameDictionary[currentSet] = extractTrainingText(trainResumePathDictionary[currentSet], currentSet)
        print('----------Extraction completed for dataset: ' + currentSet + '------------')


def fetchValuesForTraining(currentDataset):
    # Flatten the per-category dictionary into one ordered list.
    tempSkillsToTrainSet = []
    for currentSet in processingSet:
        tempSkillsToTrainSet += currentDataset[currentSet]
    return tempSkillsToTrainSet


def normalizeLanguageForMachine():
    # Fit the bag-of-words vocabulary on all training skill strings,
    # then turn each resume's skill string into a count vector.
    Resume_Vector = []
    normalizedData = []

    skillsToTrain = fetchValuesForTraining(trainResumeSkillsDictionary)
    resumeVectorizer.fit(skillsToTrain)

    for text in skillsToTrain:
        vector = resumeVectorizer.transform([text])
        Resume_Vector.append(vector.toarray())

    for x in Resume_Vector:
        normalizedData.append(x[0])

    return normalizedData


def classifyTestedResumes(testResumes, predictedResumes):
    resultDestinationBaseUrl = "result/resumes/"
    namesOnly = []
    predictedNames = []
    for i in range(len(testResumes)):
        namesOnly.append(os.path.basename(testResumes[i]))
    for i in range(len(predictedResumes)):
        currentName = namesOnly[i]
        if predictedResumes[i] == 0:
            classifyResumesInFolders(testResumes[i], resultDestinationBaseUrl + 'FE/' + currentName)
            predictedNames.append("Front End Resume")
        elif predictedResumes[i] == 1:
            classifyResumesInFolders(testResumes[i], resultDestinationBaseUrl + 'BE/' + currentName)
            predictedNames.append("Back End Resume")
        elif predictedResumes[i] == 2:
            classifyResumesInFolders(testResumes[i], resultDestinationBaseUrl + 'QA/' + currentName)
            predictedNames.append("QA Resume")
        elif predictedResumes[i] == 3:
            classifyResumesInFolders(testResumes[i], resultDestinationBaseUrl + 'DevOps/' + currentName)
            predictedNames.append("DevOps Resume")
    return namesOnly, predictedNames


def testAndClassifyResumes():
    resumePathTest = "uploads"
    testResumes = [os.path.join(resumePathTest, f) for f in os.listdir(resumePathTest)
                   if os.path.isfile(os.path.join(resumePathTest, f))]
    skillsToTrainTest = []
    for testResume in testResumes:
        testSkills = InfoExtractor.extractSkills(testResume)
        skillsToTrainTest.append(" ".join(testSkills))
    # Vectorize test resumes with the vocabulary fitted on the training set.
    newArrayToPredict = resumeVectorizer.transform(skillsToTrainTest).toarray()
    predictedResumes = testAlgo.predict(newArrayToPredict)
    return classifyTestedResumes(testResumes, predictedResumes)


def trainMachineLearningAlgorithm(normalizedDataForProcessing, trainY):
    trainX = np.array(normalizedDataForProcessing)
    trainY = np.array(trainY).ravel()  # scikit-learn expects a 1-D label array
    testAlgo.fit(trainX, trainY)
    print(trainX.shape)
    print(trainY.shape)


# "src/data/test/resumes/export_dataframe.csv"
def getTrainingDataFromCSV(file):
    trainingSetFromCSV = pd.read_csv(file)
    trainYFromFile = np.array(trainingSetFromCSV['outputClass']).reshape(-1, 1)
    trainXFromFile = np.array(trainingSetFromCSV.drop(columns=['outputClass']).values.tolist())
    print(trainYFromFile.shape)
    print(trainXFromFile.shape)
    return trainXFromFile, trainYFromFile, trainingSetFromCSV


def normalizeDataAndWriteToFile(file):
    normalizedDataForProcessing = normalizeLanguageForMachine()
    TransformedResumesData = pd.DataFrame(normalizedDataForProcessing)
    TransformedResumesData = TransformedResumesData.join(pd.DataFrame({'outputClass': trainY}))
    print(TransformedResumesData.shape)
    TransformedResumesData.rename(index=prepareResumeNameAsIndex(fetchValuesForTraining(trainResumePathDictionary)),
                                  inplace=True)
    # TransformedResumesData.columns = resumeVectorizer.get_feature_names()
    print(TransformedResumesData.shape)
    TransformedResumesData.to_csv(file, index=None, header=True)
    return normalizedDataForProcessing


def classifyResumesInFolders(source, destination):
    # Create the destination category folder if needed, then copy the resume into it.
    if not os.path.exists(destination.rsplit('/', 1)[0]):
        os.makedirs(destination.rsplit('/', 1)[0])
    shutil.copyfile(source, destination)


trainDataSet()

normalizedDataForProcessing = normalizeDataAndWriteToFile('training-data/training_data_for_resumes.csv')

trainMachineLearningAlgorithm(normalizedDataForProcessing, trainY)
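
The training flow above reduces to bag-of-words counts over skill strings fed into multinomial logistic regression. A self-contained sketch of that flow on made-up data (the skill strings and labels are illustrative, not from the actual training set):

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

# Toy skill strings (one per resume) and their class labels: 0=FE, 1=BE, 3=DevOps.
train_texts = ["react css javascript", "java spring sql", "docker kubernetes jenkins"]
train_labels = [0, 1, 3]

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(train_texts).toarray()

clf = LogisticRegression(solver='lbfgs', multi_class='auto')
clf.fit(X, train_labels)

# A new resume's skills must be vectorized with the SAME fitted vocabulary before predicting.
X_new = vectorizer.transform(["kubernetes docker terraform"]).toarray()
print(clf.predict(X_new))  # likely [3] (DevOps) on this toy data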
Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
# resume_scanner
A program that scans input text files for a specified phrase and returns the list of files containing it.

Takes file locations and the phrase as command-line arguments.
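
The implementation that follows is in C; for orientation, here is a minimal Python sketch of the behavior the README describes (the file names and entry point are illustrative, not part of this commit):

import sys

def files_with_phrase(phrase, paths):
    # Return the paths whose contents contain the phrase.
    matches = []
    for path in paths:
        try:
            with open(path) as f:
                if phrase in f.read():
                    matches.append(path)
        except OSError:
            print("could not read: " + path)
    return matches

if __name__ == "__main__":
    # usage: python scan.py PHRASE FILE [FILE ...]
    print(files_with_phrase(sys.argv[1], sys.argv[2:]))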
Lines changed: 65 additions & 0 deletions
@@ -0,0 +1,65 @@
/*
Name: SUDHANVA RAJESH
SRN: PES1UG19CS512
SEC: Q
ROLL NUMBER: 58
SUBJECT: PSWC
SUBJECT CODE: UE19CS151
TEACHER IN CHARGE: SUPREETHA S

Some requirements for the program to work:
1. Each file location passed on the command line must be entered exactly (for example C:\Users\sudhanva\Desktop\pswc_project\test1.txt) if the file is in a different folder; if the file is in the same folder as the program, the file name alone (e.g. file1.txt) is sufficient.
2. At most 20 files can contain the required pattern; this can be increased, but the size of the files array declared in the main function must be increased accordingly.
3. Patterns can be up to 10 characters long; this can be increased by enlarging the pattern and temp_pattern arrays declared in the main function.
4. In each file passed, the last character must be followed by a tab so the program can recognize the end of the file; multi-line files are accepted.
5. The files passed must not contain tab characters anywhere except at the end of the file. This is the one limitation I was not able to overcome.
6. First command-line argument = first pattern, second argument = second pattern, third argument = conditional operator, fourth argument onwards = files.
*/

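/* A hypothetical invocation, following the argument order in requirement 6 above
   and assuming the compiled binary is named a.out:
       a.out java python AND resume1.txt resume2.txt
   This searches the contents of resume1.txt and resume2.txt for the patterns
   "java" and "python", combined with the AND operator. */
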
#include"header.h"
20+
#include<stdio.h>
21+
#include<string.h>
22+
int search(char pattern[10],char filecontent[30])//function to check if given pattern is present in the file passed,returns non zero if found.
23+
{
24+
25+
if(strstr(filecontent,pattern)!=NULL)//at this step,if the pattern is found,the char array pattern points to the location at which the pattern was found in the contents of the file,hence we need to reassign it to the required pattern at every iteration
26+
{
27+
return 1;
28+
}
29+
else
30+
{
31+
return 0;
32+
}
33+
}
34+
35+
int operator_check(char op[4])
36+
{
37+
if((op[0]=='O'&&op[1]=='R')||(op[0]=='o'&&op[1]=='r'))
38+
{
39+
return 1;
40+
}
41+
else if((op[0]=='A'&&op[1]=='N'&&op[2]=='D')||(op[0]=='a'&&op[1]=='n'&&op[2]=='d'))
42+
{
43+
return 2;
44+
}
45+
else {
46+
return 0;
47+
}
48+
}
49+
50+
51+
52+
void files_with_pattern(int counter,char *files[20])
53+
{
54+
int i;
55+
printf("---------------------------------------------------\n");
56+
printf("Number of files containing the given pattern= %d\n",counter);
57+
printf("---------------------------------------------------\n");
58+
printf("list of the location of files containing given pattern:\n");
59+
for(i=0;i<counter;i++)
60+
{
61+
printf("file %d = %s\n",i+1,files[i]);
62+
}
63+
printf("---------------------------------------------------\n");
64+
65+
}
Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
int search(char pattern[10], char filecontent[30]);
int operator_check(char op[4]);
void files_with_pattern(int counter, char *files[20]);
