Sunday, May 10, 2020

Named Entity Recognition on a PDF Resume using NLP and spaCy in Python


In [22]:
    import spacy
    import json
except Exception as e:

class EntityGenerator(object):
    """Group the named entities spaCy finds in a text by entity label."""

    __slots__ = ['text']

    def __init__(self, text=None):
        self.text = text

    def get(self):
        """Return the entities of ``self.text`` as a JSON-serialisable dict.

        The result maps each entity label found in the text (e.g. ``"ORG"``)
        to a list of entity strings. Every label that occurs becomes a key,
        in order of first appearance. Each distinct entity string is listed
        once, under the label of its *last* occurrence, so a label whose
        strings were all re-labelled later maps to an empty list.

        Returns an empty dict when ``self.text`` is ``None`` or empty.
        """
        if not self.text:
            return {}

        # Imported lazily so that defining the class does not require spaCy
        # to be installed; only calling get() does.
        import spacy

        nlp = spacy.load("en_core_web_sm")
        doc = nlp(self.text)

        grouped = {}
        label_by_text = {}
        for ent in doc.ents:
            # Register every label up front so it appears in the output even
            # if all of its strings end up under another label.
            grouped.setdefault(ent.label_, [])
            # Later occurrences overwrite earlier ones: last label wins.
            label_by_text[ent.text] = ent.label_

        for ent_text, label in label_by_text.items():
            grouped[label].append(ent_text)
        return grouped
In [23]:
    import PyPDF2
    import requests
    import json
except Exception:

class Resume(object):
    """Read the text of the first page of a PDF file."""

    def __init__(self, filename=None):
        self.filename = filename

    def get(self):
        """Return the extracted text of page 0 of ``self.filename``.

        Also prints the total number of pages in the document. Only the
        first page is read, regardless of how many pages the PDF has.

        NOTE(review): ``PdfFileReader`` / ``getPage`` / ``numPages`` /
        ``extractText`` are the PyPDF2 1.x names; newer PyPDF2 releases
        rename them -- confirm the installed version before upgrading.
        """
        # Imported lazily so that defining the class does not require PyPDF2
        # to be installed; only calling get() does.
        import PyPDF2

        # The context manager closes the file even if parsing fails. The
        # text must be extracted while the file is still open, because the
        # reader pulls page data from the file object on demand.
        with open(self.filename, 'rb') as pdf_file:
            pdf_reader = PyPDF2.PdfFileReader(pdf_file)
            first_page = pdf_reader.getPage(0)
            print("Total Pages : {} ".format(pdf_reader.numPages))
            return first_page.extractText()
In [24]:
# Extract the text of the first page of the resume PDF; `response_news` is
# consumed by the entity-extraction cell that follows.
resume = Resume(filename="0.pdf")
response_news = resume.get()
Total Pages : 1 
In [25]:
# Group the entities found in the resume text by label and pretty-print the
# resulting dict as JSON.
helper = EntityGenerator(text=response_news)
response = helper.get()
print(json.dumps(response , indent=3))
   "ORG": [
      "University of Bridgeport   \n                                                                                                                                                                                                                              ",
      "University of Mumbai",
      "Electronic Engineering",
      "University of Bridgeport",
      "NASA C",
      "Space Grant\n              University of \nBridgeport",
      "Applications and Technology Conference",
   "CARDINAL": [
   "DATE": [
      "May 2020",
      "May 2013",
      "May 3",
      "10 \nYears"
   "GPE": [
      "-soumil/ |\n  "
   "PERCENT": [
      "80 %"
   "PERSON": [
      "Machine Learning",
      "|\n ",
   "LAW": [
      "Air Pollution"
   "NORP": [
   "LOC": []

Custom Entity Training

I copied this code for custom entity training from the motiversity blog. Use this template to train your own model and entities.
In [67]:
import spacy
import random
In [68]:
# Product names used to build the NER training questions below.
_PRODUCT_NAMES = (
    'polo', 'ball', 'jegging', 't-shirt', 'jeans',
    'bat', 'shirt', 'bag', 'cup', 'jug',
    'plate', 'glass', 'monitor', 'desktop', 'bottle',
    'mouse', 'keyboard', 'chair', 'table', 'watch',
)
_QUESTION_PREFIX = 'what is the price of '


def _price_question(product, label='PrdName'):
    """Build one training example: the question text plus the product span.

    The character offsets are computed from the prefix and product lengths,
    so they stay correct whatever the product name is.
    """
    start = len(_QUESTION_PREFIX)
    end = start + len(product)
    return (_QUESTION_PREFIX + product + '?', {'entities': [(start, end, label)]})


# Training examples in the spaCy 2.x "simple training style":
# (text, {'entities': [(start_char, end_char, label)]}).
TRAIN_DATA = [_price_question(name) for name in _PRODUCT_NAMES]
In [69]:
def train_spacy(data, iterations):
    """Train a blank English spaCy pipeline containing only an NER component.

    Written against the spaCy 2.x training API (``create_pipe``,
    ``begin_training``, ``update`` with raw texts and annotation dicts).

    Args:
        data: iterable of ``(text, {'entities': [(start, end, label), ...]})``
            training examples.
        iterations: number of passes to make over ``data``.

    Returns:
        The trained ``Language`` object.

    The running loss dict is printed after every pass.
    """
    # Work on a private copy so shuffling does not reorder the caller's list.
    examples = list(data)

    nlp = spacy.blank('en')  # create blank Language class
    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
    else:
        ner = nlp.get_pipe('ner')

    # add labels: the NER component must know every label before training
    for _, annotations in examples:
        for ent in annotations.get('entities'):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        optimizer = nlp.begin_training()
        for itn in range(iterations):
            print("Statring iteration " + str(itn))
            # A fresh order each pass keeps the model from learning the
            # sequence of the examples.
            random.shuffle(examples)
            losses = {}
            for text, annotations in examples:
                nlp.update(
                    [text],  # batch of texts
                    [annotations],  # batch of annotations
                    drop=0.2,  # dropout - make it harder to memorise data
                    sgd=optimizer,  # callable to update weights
                    losses=losses)  # accumulates this pass's loss per pipe
            print(losses)
    return nlp

# Train the product-name recogniser with 20 passes over TRAIN_DATA.
prdnlp = train_spacy(TRAIN_DATA, 20)

# Save our trained Model to the directory the user names.
modelfile = input("Enter your Model Name: ")
prdnlp.to_disk(modelfile)

#Test your text
test_text = input("Enter your testing text: ")
doc = prdnlp(test_text)
# Print each recognised entity with its character span and label.
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)
Statring iteration 0
{'ner': 43.70497216920921}
Statring iteration 1
{'ner': 2.710291442944125}
Statring iteration 2
{'ner': 1.996694022562065}
Statring iteration 3
{'ner': 2.1947250592188867}
Statring iteration 4
{'ner': 0.9058895410132363}
Statring iteration 5
{'ner': 1.167754612674311}
Statring iteration 6
{'ner': 6.173202133882057}
Statring iteration 7
{'ner': 3.2939778793527186}
Statring iteration 8
{'ner': 2.1191635131465687}
Statring iteration 9
{'ner': 5.807327533150592}
Statring iteration 10
{'ner': 1.1973480416004578}
Statring iteration 11
{'ner': 2.1269163827046267}
Statring iteration 12
{'ner': 2.504892537738018}
Statring iteration 13
{'ner': 1.8551057868492804}
Statring iteration 14
{'ner': 2.9309291534996826}
Statring iteration 15
{'ner': 3.1783536320935912}
Statring iteration 16
{'ner': 2.0050045232922225}
Statring iteration 17
{'ner': 5.117727673072695}
Statring iteration 18
{'ner': 2.1752474400122725}
Statring iteration 19
{'ner': 3.564470754858085}
Enter your Model Name: Test
Enter your testing text: what is price of table ?
table 17 22 PrdName



