KNN Machine Learning Algorithm on Elasticsearch
Step 1¶
Import the libraries
In [1]:
# Import every dependency of the notebook in one place and report a missing
# package with a readable message instead of a traceback.
try:
    # Standard library first, so these names are bound even when a
    # third-party package below is not installed.
    import datetime
    import json
    import os
    import sys
    from ast import literal_eval

    import elasticsearch
    import numpy as np
    import pandas as pd
    from elasticsearch import Elasticsearch, helpers
    from tqdm import tqdm

    print("Loaded .. . . . . . . .")
except ImportError as e:
    # The bound name must match the one used in the message; a mismatch
    # would turn a missing module into a NameError.
    print("Some Modules are Missing {} ".format(e))
In [21]:
# Address of the Elasticsearch node that the client is pointed at.
ENDPOINT = "http://localhost:9200"
In [29]:
# Build the client for ENDPOINT with a timeout of 600, then ping the node;
# the ping result is the value this cell displays.
es = Elasticsearch(hosts=ENDPOINT, timeout=600)
es.ping()
Out[29]:
Step 2:¶
- Preprocessing
Reading the Dataset¶
In [4]:
# Show the entries of the current working directory.
os.listdir()
Out[4]:
In [5]:
# Load the Netflix titles CSV (expected in the working directory) into a DataFrame.
df = pd.read_csv("netflix_titles.csv")
In [7]:
# Preview the first row of the loaded frame.
df.head(1)
Out[7]:
In [8]:
# Extract the "title" column as a plain Python list.
titles = df["title"].to_list()
In [9]:
# Number of titles read from the CSV.
len(titles)
Out[9]:
In [10]:
import tensorflow as tf
import tensorflow_hub as hub

# Pre-trained NNLM text encoder from TF Hub; the index mapping stores its
# output in a 128-dimensional vector field.
module_url = "https://tfhub.dev/google/nnlm-en-dim128/2"
embed = hub.KerasLayer(module_url)

# Embed the titles in batches rather than one model call per title: the layer
# accepts a 1-D batch of strings and returns one row per input string.
EMBED_BATCH_SIZE = 256

# vector[i] is the embedding of titles[i], stored as a plain list of floats
# so it can be serialised to JSON for indexing.
vector = []
for start in range(0, len(titles), EMBED_BATCH_SIZE):
    batch = titles[start:start + EMBED_BATCH_SIZE]
    embeddings = embed(tf.constant(batch))
    vector.extend(np.asarray(embeddings).tolist())
In [11]:
# Number of embeddings produced.
len(vector)
Out[11]:
In [12]:
# Inspect the first embedding.
vector[0]
Out[12]:
In [13]:
# One bulk "index" action per title, targeting the "myml" index. The list
# position doubles as the document id and selects the matching embedding.
requests = [
    {
        "_op_type": "index",
        "_index": "myml",
        "_id": position,
        "title": title_text,
        "title_vector": vector[position],
    }
    for position, title_text in enumerate(titles)
]
In [14]:
# Inspect the first bulk action.
requests[0]
Out[14]:
Define mappings¶
In [30]:
# Index-level options: shard/replica counts and the k-NN switch.
index_settings = {
    "number_of_shards": 2,
    "number_of_replicas": 1,
    "index.knn": True,
}

# Field definitions: the raw title as text, its embedding as a 128-d knn_vector.
title_field = {"type": "text"}
title_vector_field = {"type": "knn_vector", "dimension": 128}

# Full create-index body, assembled from the parts above.
settings = {
    "settings": index_settings,
    "mappings": {
        "dynamic": "true",
        "_source": {"enabled": "true"},
        "properties": {
            "title": title_field,
            "title_vector": title_vector_field,
        },
    },
}
In [31]:
# Create the index from the settings/mappings body. HTTP 400 and 404 are
# passed via `ignore`, so check the value stored in `my` for an error payload.
IndexName = "myml"
my = es.indices.create(
    index=IndexName,
    body=settings,
    ignore=[400, 404],
)
In [32]:
# Display the create-index response.
my
Out[32]:
In [33]:
# Send every prepared action in a single bulk call. A failure is printed
# rather than raised, so the cell always finishes.
try:
    res = helpers.bulk(es, requests)
    print("Working")
except Exception as bulk_error:
    print(bulk_error)
Testing KNN model¶
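We use the k-NN plugin's `knn` query to retrieve the titles nearest to the query embedding (the distance metric is the index's k-NN space type, which defaults to L2 unless cosine similarity is configured).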
In [36]:
# Embed the user's query with the same encoder that produced the indexed
# title vectors, converting the single result row to a plain list of floats.
title = input("Enter query: ")
query_vector = np.asarray(embed(tf.constant([title])))[0].tolist()

# Nearest-neighbour lookup on the knn_vector field. This is the only query
# the search uses; an earlier script_score definition was overwritten before
# it was ever sent, so it has been dropped.
script_query = {
    "knn": {
        "title_vector": {
            "vector": query_vector,
            "k": 2
        }
    }
}

response = es.search(
    index="myml",
    body={
        "size": 10,
        "query": script_query,
        "_source": {"includes": ["title", "body"]}
    }
)

# Print id, score and stored source for every hit, separated by a blank line.
for hit in response["hits"]["hits"]:
    print("id: {}, score: {}".format(hit["_id"], hit["_score"]))
    print(hit["_source"])
    print()
Hi, when I try to execute "my = es.indices.create(index=IndexName, ignore=[400,404], body=settings)", why does it always show this error?
{'error': {'root_cause': [{'type': 'illegal_argument_exception',
'reason': 'unknown setting [index.knn] please check that any required plugins are installed, or check the breaking changes documentation for removed settings'}],
'type': 'illegal_argument_exception',
'reason': 'unknown setting [index.knn] please check that any required plugins are installed, or check the breaking changes documentation for removed settings'},
'status': 400}
Do you have solutions?
Please, how do I fix this error message: RequestError: RequestError(400, 'parsing_exception', 'unknown query [knn]')
To fix the error, you need to use the latest version of the Elasticsearch syntax.
Example from the documentation:
PUT my-index-2
{
"mappings": {
"properties": {
"my_vector": {
"type": "dense_vector",
"dims": 3,
"index": true,
"similarity": "dot_product"
}
}
}
}