Named Entity Recognition on PDF Resumes using NLP and spaCy

try:
    import spacy
    import json
except Exception as e:
    print(e)
    

class EntityGenerator(object):
    """Group the named entities that spaCy finds in a text by entity label."""

    # Only ``text`` is stored per instance (the original spelled this
    # ``_slots__``, which Python silently ignored).
    __slots__ = ['text']

    def __init__(self, text=None):
        """Remember the text to analyse.

        Args:
            text: The string to run entity recognition on. May be ``None``.
        """
        self.text = text

    @staticmethod
    def _group_by_label(pairs):
        """Collect ``(entity_text, label)`` pairs into ``{label: [entity_text, ...]}``.

        Labels and texts keep the order in which they are first seen, and an
        identical ``(entity_text, label)`` pair is only recorded once.
        """
        grouped = {}
        seen = set()
        for ent_text, label in pairs:
            bucket = grouped.setdefault(label, [])
            # Key on the pair, not on the text alone: the same text may be
            # tagged with different labels and must show up under each one.
            if (ent_text, label) not in seen:
                seen.add((ent_text, label))
                bucket.append(ent_text)
        return grouped

    def get(self):
        """Return the entities found in ``self.text``, grouped by label.

        Returns:
            dict: Maps each entity label (``ent.label_``) to the list of
            distinct entity texts carrying that label. The result contains
            only strings and lists, so it can be passed to ``json.dumps``.
            An empty dict is returned when ``self.text`` is ``None`` or empty.
        """
        if not self.text:
            # Nothing to analyse; avoid loading the model for no input.
            return {}

        # The "en_core_web_sm" model is loaded on every call.
        nlp = spacy.load("en_core_web_sm")
        doc = nlp(self.text)
        return self._group_by_label((ent.text, ent.label_) for ent in doc.ents)

try:
    import PyPDF2
    import requests
    import json
except Exception:
    pass

class Resume(object):
    """Read the text of the first page of a PDF file."""

    def __init__(self, filename=None):
        """Remember the path of the PDF to read.

        Args:
            filename: Path of the PDF file that ``get`` opens.
        """
        self.filename = filename

    def get(self):
        """Return the text extracted from the first page of the PDF.

        The total page count is printed to stdout, but only page 0 is
        extracted and returned.

        Returns:
            The text that ``extractText()`` produces for page 0.
        """
        # The context manager closes the file even if PyPDF2 raises; the
        # extraction happens inside it, while the file is still open.
        with open(self.filename, 'rb') as pdf_file:
            pdf_reader = PyPDF2.PdfFileReader(pdf_file)
            first_page = pdf_reader.getPage(0)
            print("Total Pages : {} ".format(pdf_reader.numPages))
            return first_page.extractText()

# Read the resume text out of "0.pdf"; ``response_news`` holds the extracted text.
resume = Resume(filename="0.pdf")
response_news = resume.get()

Total Pages : 1

# Run entity extraction on the resume text and pretty-print the result as JSON.
helper = EntityGenerator(text=response_news)
response = helper.get()
print(json.dumps(response , indent=3))

{
   "ORG": [
      "University of Bridgeport   \n                                                                                                                                                                                                                              ",
      "GPA",
      "University of Mumbai",
      "Electronic Engineering",
      "Shelton",
      "CI",
      "AWS",
      "UI",
      "University of Bridgeport",
      "JavaScript",
      "Bootstrap",
      "NASA C",
      "CT",
      "Space Grant\n              University of \nBridgeport",
      "Publicat",
      "Applications and Technology Conference",
      "Farmingdale\n"
   ],
   "CARDINAL": [
      "3.81/",
      "7/10",
      "2017",
      "50",
      "328",
      "3000",
      "600"
   ],
   "DATE": [
      "4",
      "August",
      "May 2020",
      "May 2013",
      "2019",
      "May 3",
      "10 \nYears"
   ],
   "GPE": [
      "Budderfly",
      "Bridgeport",
      "Python",
      "US",
      "YouTube",
      "Nitin",
      "-soumil/ |\n  "
   ],
   "PERCENT": [
      "10%",
      "8%",
      "80 %"
   ],
   "PERSON": [
      "onnecticut",
      "Jun",
      "Creator",
      "Machine Learning",
      "| https://github.com/soumilshah1995\n ",
      "Summary"
   ],
   "LAW": [
      "Air Pollution"
   ],
   "NORP": [
      "Query"
   ],
   "LOC": []
}

Custom Entity Training¶

I copied this custom-entity training code from the motiversity blog; use it as a template to train your own model and entities.

import spacy
import random

# Training examples for the custom 'PrdName' entity.
# Each item is (text, {'entities': [(start, end, label)]}), where start/end are
# character offsets into the text: the product name begins at index 21, right
# after "what is the price of ", and end is exclusive (the trailing '?' is left out).
# NOTE(review): 'moniter' and 'keyboad' look like misspellings of 'monitor' and
# 'keyboard' -- confirm; if 'keyboad' is corrected its end offset must become 29.
TRAIN_DATA = [('what is the price of polo?', {'entities': [(21, 25, 'PrdName')]}), 
              ('what is the price of ball?', {'entities': [(21, 25, 'PrdName')]}), 
              ('what is the price of jegging?', {'entities': [(21, 28, 'PrdName')]}), 
              ('what is the price of t-shirt?', {'entities': [(21, 28, 'PrdName')]}), 
              ('what is the price of jeans?', {'entities': [(21, 26, 'PrdName')]}), 
              ('what is the price of bat?', {'entities': [(21, 24, 'PrdName')]}), 
              ('what is the price of shirt?', {'entities': [(21, 26, 'PrdName')]}), 
              ('what is the price of bag?', {'entities': [(21, 24, 'PrdName')]}), 
              ('what is the price of cup?', {'entities': [(21, 24, 'PrdName')]}), 
              ('what is the price of jug?', {'entities': [(21, 24, 'PrdName')]}), 
              ('what is the price of plate?', {'entities': [(21, 26, 'PrdName')]}), 
              ('what is the price of glass?', {'entities': [(21, 26, 'PrdName')]}), 
              ('what is the price of moniter?', {'entities': [(21, 28, 'PrdName')]}), 
              ('what is the price of desktop?', {'entities': [(21, 28, 'PrdName')]}), 
              ('what is the price of bottle?', {'entities': [(21, 27, 'PrdName')]}), 
              ('what is the price of mouse?', {'entities': [(21, 26, 'PrdName')]}), 
              ('what is the price of keyboad?', {'entities': [(21, 28, 'PrdName')]}), 
              ('what is the price of chair?', {'entities': [(21, 26, 'PrdName')]}), 
              ('what is the price of table?', {'entities': [(21, 26, 'PrdName')]}), 
              ('what is the price of watch?', {'entities': [(21, 26, 'PrdName')]})]

def train_spacy(data, iterations):
    """Train a blank English spaCy pipeline containing only an NER component.

    Args:
        data: Iterable of ``(text, annotations)`` pairs, where ``annotations``
            is a dict whose ``'entities'`` value is a list of
            ``(start, end, label)`` tuples.
        iterations: Number of passes to make over the training data.

    Returns:
        The trained ``nlp`` pipeline object.
    """
    # Work on a private copy: random.shuffle below reorders in place and must
    # not scramble the list the caller passed in.
    train_data = list(data)

    nlp = spacy.blank('en')  # create blank Language class
    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
    else:
        # Keep ``ner`` bound even if the pipeline already has the component.
        ner = nlp.get_pipe('ner')

    # add labels
    for _, annotations in train_data:
        # ``or []`` tolerates a missing or None 'entities' entry.
        for ent in annotations.get('entities') or []:
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        optimizer = nlp.begin_training()
        for itn in range(iterations):
            print("Statring iteration " + str(itn))
            random.shuffle(train_data)
            losses = {}  # filled in by nlp.update for this iteration
            for text, annotations in train_data:
                nlp.update(
                    [text],  # batch of texts
                    [annotations],  # batch of annotations
                    drop=0.2,  # dropout - make it harder to memorise data
                    sgd=optimizer,  # callable to update weights
                    losses=losses)
            print(losses)
    return nlp


# Train on the sample data for 20 iterations.
prdnlp = train_spacy(TRAIN_DATA, 20)

# Save the trained model to the path typed by the user.
modelfile = input("Enter your Model Name: ")
prdnlp.to_disk(modelfile)

# Run the trained model on user-supplied text and print each entity's
# text, start/end character offsets and label.
test_text = input("Enter your testing text: ")
doc = prdnlp(test_text)
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

Statring iteration 0
{'ner': 43.70497216920921}
Statring iteration 1
{'ner': 2.710291442944125}
Statring iteration 2
{'ner': 1.996694022562065}
Statring iteration 3
{'ner': 2.1947250592188867}
Statring iteration 4
{'ner': 0.9058895410132363}
Statring iteration 5
{'ner': 1.167754612674311}
Statring iteration 6
{'ner': 6.173202133882057}
Statring iteration 7
{'ner': 3.2939778793527186}
Statring iteration 8
{'ner': 2.1191635131465687}
Statring iteration 9
{'ner': 5.807327533150592}
Statring iteration 10
{'ner': 1.1973480416004578}
Statring iteration 11
{'ner': 2.1269163827046267}
Statring iteration 12
{'ner': 2.504892537738018}
Statring iteration 13
{'ner': 1.8551057868492804}
Statring iteration 14
{'ner': 2.9309291534996826}
Statring iteration 15
{'ner': 3.1783536320935912}
Statring iteration 16
{'ner': 2.0050045232922225}
Statring iteration 17
{'ner': 5.117727673072695}
Statring iteration 18
{'ner': 2.1752474400122725}
Statring iteration 19
{'ner': 3.564470754858085}
Enter your Model Name: Test
Enter your testing text: what is price of table ?
table 17 22 PrdName

Pythonist

Sunday, May 10, 2020

Named Entity Recognition on PDF Resumes using NLP and spaCy (Python)

Named Entity Recognition on PDF Resumes using NLP and spaCy

Custom Entity Training¶

I copied this custom-entity training code from the motiversity blog; use it as a template to train your own model and entities.

References¶

6 comments:

Getting started with LakeFS and Apache Iceberg Running Locally