Power of Semantic Search combined with Elasticsearch | ML on ELK ¶
Soumil Nitin Shah¶
Bachelor's in Electronic Engineering | Master's in Electrical Engineering | Master's in Computer Engineering
- Website : https://soumilshah.herokuapp.com
- Github: https://github.com/soumilshah1995
- Linkedin: https://www.linkedin.com/in/shah-soumil/
- Blog: https://soumilshah1995.blogspot.com/
- Youtube : https://www.youtube.com/channel/UC_eOodxvwS_H7x2uLQa-svw?view_as=subscriber
- Facebook Page : https://www.facebook.com/soumilshah1995/
- Email : shahsoumil519@gmail.com
Step 1: Define imports¶
In [4]:
try:
    import json
    import os
    import uuid

    import pandas as pd
    import numpy as np

    import elasticsearch
    from elasticsearch import Elasticsearch
    from elasticsearch import helpers

    from sentence_transformers import SentenceTransformer, util
    from tqdm import tqdm
    from dotenv import load_dotenv

    # secret.env is expected to define ENDPOINT (the Elasticsearch/OpenSearch URL)
    load_dotenv("secret.env")
except Exception as e:
    print("Some modules are missing: {}".format(e))
Step 2: Define helper classes¶
In [5]:
class Reader(object):
    """Reads the first chunk of the CSV file into a DataFrame."""

    def __init__(self, file_name):
        self.file_name = file_name

    def run(self):
        # chunksize returns an iterator; take only the first 3,000 rows
        df = pd.read_csv(self.file_name, chunksize=3000)
        df = next(df)
        df = df.fillna("")
        return df
- This class converts the given text into a vector embedding using a pre-trained SentenceTransformer model
In [6]:
class Tokenizer(object):
    """Encodes text into a dense vector with a pre-trained SentenceTransformer."""

    def __init__(self):
        self.model = SentenceTransformer('all-MiniLM-L6-v2')

    def get_token(self, documents):
        sentences = [documents]
        sentence_embeddings = self.model.encode(sentences)
        # flatten the (1, 384) array into a plain Python list of floats
        encod_list = sentence_embeddings.flatten().tolist()
        return encod_list
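As a quick sanity check, all-MiniLM-L6-v2 produces a 384-dimensional embedding regardless of the input length; a minimal sketch (the query string is hypothetical):

In [ ]:
helper_token = Tokenizer()
vector = helper_token.get_token("senior python developer")
print(len(vector))  # 384 dimensions for all-MiniLM-L6-v2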
In [7]:
class ElasticSearchImports(object):
    """Indexes each DataFrame row as a document in Elasticsearch."""

    def __init__(self, df, index_name='posting'):
        self.df = df
        self.index_name = index_name
        self.es = Elasticsearch(timeout=600, hosts=os.getenv("ENDPOINT"))

    def run(self):
        elk_data = self.df.to_dict("records")
        for job in elk_data:
            try:
                self.es.index(index=self.index_name, body=job)
            except Exception as e:
                # skip documents that fail to index, but surface the error
                print("Failed to index document: {}".format(e))
        return True
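Note: the knn query used in Step 4 comes from the OpenSearch / Open Distro k-NN plugin, which requires the index to be created with a knn_vector mapping. The notebook does not show that step, so here is a minimal sketch under the assumption that your cluster runs that plugin; the index and field names match the code above, the settings are illustrative:

In [ ]:
# Assumption: the cluster runs the OpenSearch/Open Distro k-NN plugin,
# which is what the "knn" query in Step 4 requires.
mapping = {
    "settings": {"index": {"knn": True}},
    "mappings": {
        "properties": {
            "vectors": {
                "type": "knn_vector",
                "dimension": 384  # all-MiniLM-L6-v2 output size
            }
        }
    }
}
es = Elasticsearch(timeout=600, hosts=os.getenv("ENDPOINT"))
if not es.indices.exists(index="posting"):
    es.indices.create(index="posting", body=mapping)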
Step 3: Converting the column to vector embeddings¶
In [25]:
helper = Reader(file_name="data job posts.csv")
df = helper.run()
In [26]:
tqdm.pandas()
helper_token = Tokenizer()
df["vectors"] = df["jobpost"].progress_apply(helper_token.get_token)
In [70]:
helper_elk = ElasticSearchImports(df=df)
helper_elk.run()
Out[70]:
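The helpers module imported in Step 1 is never used: run() sends one HTTP request per document, which is slow for large frames. A hypothetical rewrite of the same ingestion using helpers.bulk:

In [ ]:
from elasticsearch import helpers

def bulk_index(df, es, index_name="posting"):
    # build one action per row; helpers.bulk batches them into fewer HTTP calls
    actions = [
        {"_index": index_name, "_source": record}
        for record in df.to_dict("records")
    ]
    helpers.bulk(es, actions)

# bulk_index(df, Elasticsearch(timeout=600, hosts=os.getenv("ENDPOINT")))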
Step 4: Test the semantic search¶
In [14]:
helper_token = Tokenizer()
INPUT = input("Enter the Input Query ")
token_vector = helper_token.get_token(INPUT)
query = {
    "size": 50,
    "_source": "Title",  # return only the Title field
    "query": {
        "bool": {
            "must": [
                {
                    # k-NN query against the stored embedding vectors
                    "knn": {
                        "vectors": {
                            "vector": token_vector,
                            "k": 20
                        }
                    }
                }
            ]
        }
    }
}

es = Elasticsearch(timeout=600, hosts=os.getenv("ENDPOINT"))
res = es.search(index='posting',
                size=50,
                body=query,
                request_timeout=55)

title = [x['_source'] for x in res['hits']['hits']]
title
Out[14]:
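The knn query above is plugin-specific. On a stock Elasticsearch cluster where vectors is instead mapped as dense_vector, the equivalent semantic search would be a script_score query with cosineSimilarity; a sketch under that assumption:

In [ ]:
# Assumption: "vectors" is mapped as dense_vector on a stock Elasticsearch cluster.
query = {
    "size": 50,
    "_source": "Title",
    "query": {
        "script_score": {
            "query": {"match_all": {}},
            "script": {
                # cosineSimilarity ranges [-1, 1]; +1.0 keeps scores non-negative
                "source": "cosineSimilarity(params.query_vector, 'vectors') + 1.0",
                "params": {"query_vector": token_vector}
            }
        }
    }
}
res = es.search(index="posting", body=query, request_timeout=55)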