Saturday, May 16, 2020

KNN Machine learning Algorithm on ElasticSearch

Untitled

KNN Machine learning Algorithm on ElasticSearch

Step 1

Import the libraries

In [1]:
# Import every dependency used by the notebook in one place.
# The try/except turns a failed import into a readable hint instead of a
# traceback; the handler must format the same name it binds (`e`), otherwise
# the hint itself fails with a NameError.
try:
    import elasticsearch
    from elasticsearch import Elasticsearch

    import pandas as pd
    import json
    from ast import literal_eval
    from tqdm import tqdm
    import datetime
    import os
    import sys
    import numpy as np

    from elasticsearch import helpers
    print("Loaded  .. . . . . . . .")
except Exception as e:
    print("Some Modules are Missing {} ".format(e))
Loaded  .. . . . . . . .
In [21]:
# Base URL of the Elasticsearch node that the client below connects to.
ENDPOINT = "http://localhost:9200"
In [29]:
# Client for the node at ENDPOINT; the long timeout (600) leaves room for the
# bulk-indexing request issued later in the notebook.
es = Elasticsearch(
    hosts=ENDPOINT,
    timeout=600,
)
# Cell output: whether the node answered the ping.
es.ping()
Out[29]:
True

Step 2:

  • Preprocessing

Reading the Dataset

In [4]:
# List the working directory to confirm the dataset CSV is present.
os.listdir()
Out[4]:
['.ipynb_checkpoints', 'netflix_titles.csv', 'Untitled.ipynb']
In [5]:
# Load the Netflix titles dataset from the working directory into a DataFrame.
df= pd.read_csv("netflix_titles.csv")
In [7]:
# Peek at the first row to see which columns the dataset provides.
df.head(1)
Out[7]:
show_id type title director cast country date_added release_year rating duration listed_in description
0 81145628 Movie Norm of the North: King Sized Adventure Richard Finn, Tim Maltby Alan Marriott, Andrew Toth, Brian Dobson, Cole... United States, India, South Korea, China September 9, 2019 2019 TV-PG 90 min Children & Family Movies, Comedies Before planning an awesome wedding for his gra...
In [8]:
# Plain Python list of every value in the "title" column, in row order.
title_column = df["title"]
titles = title_column.tolist()
In [9]:
# Number of titles that will be embedded and indexed.
len(titles)
Out[9]:
6234

Step 3:

Convert each title into a 128-dimensional vector using Google's pre-trained NNLM text-embedding model from TensorFlow Hub (`nnlm-en-dim128`)

In [10]:
import tensorflow as tf
import tensorflow_hub as hub

# Pre-trained text-embedding layer loaded from TensorFlow Hub; the module name
# indicates 128-dimensional English NNLM embeddings.
module_url = "https://tfhub.dev/google/nnlm-en-dim128/2"
embed = hub.KerasLayer(module_url)

# Titles are embedded in batches rather than one model call per title: the
# layer accepts a 1-D batch of strings, so this yields the same vectors with
# far fewer calls.
EMBED_BATCH_SIZE = 512

# vector[i] is the embedding of titles[i] as a plain list of floats, which is
# directly JSON-serialisable for indexing.
vector = []

for start in range(0, len(titles), EMBED_BATCH_SIZE):
    batch = titles[start:start + EMBED_BATCH_SIZE]
    embeddings = embed(tf.constant(batch))
    vector.extend(np.asarray(embeddings).tolist())
In [11]:
# Sanity check: there should be one embedding per title.
len(vector)
Out[11]:
6234
In [12]:
# Inspect the embedding of the first title (a list of floats).
vector[0]
Out[12]:
[0.08594007790088654,
 -0.09169773757457733,
 -0.08221833407878876,
 0.1603367030620575,
 0.05244443565607071,
 0.11267174780368805,
 -0.08382084965705872,
 0.09882047772407532,
 0.021728241816163063,
 -0.18144601583480835,
 0.0012927550124004483,
 0.030685214325785637,
 -0.04533662274479866,
 -0.07281118631362915,
 -0.11955679953098297,
 0.017013853415846825,
 0.033623743802309036,
 -0.009736376814544201,
 0.033763282001018524,
 0.1921098679304123,
 0.00620125001296401,
 0.015555041842162609,
 0.06574436277151108,
 0.11323074996471405,
 -0.10774067789316177,
 0.1693897843360901,
 -0.13922490179538727,
 -0.10309454798698425,
 -5.2244857215555385e-05,
 0.023089049383997917,
 0.04559335112571716,
 -0.10510903596878052,
 -0.1005614772439003,
 -0.07881765812635422,
 0.025743374601006508,
 -0.05974612385034561,
 -0.1747055947780609,
 -0.05892287939786911,
 -0.06596986949443817,
 -0.09151236712932587,
 0.03593139722943306,
 -0.07345644384622574,
 -0.018012331798672676,
 0.036221787333488464,
 0.07314501702785492,
 -0.06195896118879318,
 -0.0023348417598754168,
 -0.1982719600200653,
 -0.3291093707084656,
 0.006821473129093647,
 0.1486814171075821,
 0.2550199031829834,
 0.1663597822189331,
 0.15605349838733673,
 0.12756910920143127,
 -0.057475071400403976,
 0.14456160366535187,
 -0.05416375771164894,
 0.06393317133188248,
 -0.08582285046577454,
 0.019529936835169792,
 0.030426720157265663,
 -0.13159017264842987,
 -0.01176383811980486,
 -0.05212199687957764,
 -0.007775180973112583,
 0.0005310662090778351,
 0.03532465547323227,
 0.14036867022514343,
 -0.04217003658413887,
 -0.0504852756857872,
 0.08859632164239883,
 0.02489238791167736,
 0.036609407514333725,
 0.012656561098992825,
 -0.031059175729751587,
 0.13535012304782867,
 -0.07467728853225708,
 -0.00639297952875495,
 -0.007216154597699642,
 0.10756982862949371,
 -0.03459356725215912,
 0.05434964969754219,
 0.10563021898269653,
 -0.023835688829421997,
 -0.1384897232055664,
 -0.10662095248699188,
 -0.11560706794261932,
 -0.018126854673027992,
 -0.11542601138353348,
 0.05233073979616165,
 -0.08457083255052567,
 0.04891547933220863,
 0.048610806465148926,
 -0.0861951932311058,
 -0.1646905094385147,
 0.05879170447587967,
 -0.09346245974302292,
 0.21104931831359863,
 0.07167480885982513,
 0.09941790252923965,
 -0.04874766618013382,
 -0.11821635812520981,
 -0.11691499501466751,
 -0.04042290896177292,
 -0.035517025738954544,
 0.006470585707575083,
 0.07046835869550705,
 0.032006461173295975,
 -0.017604319378733635,
 0.1958240568637848,
 0.01993837021291256,
 -0.01663972996175289,
 0.11849723011255264,
 -0.10080186277627945,
 -0.009301570244133472,
 0.03264541178941727,
 -0.03453604504466057,
 -0.032728590071201324,
 -0.06038405001163483,
 -0.014748498797416687,
 -0.08714324235916138,
 0.0329294428229332,
 -0.04497246816754341,
 -0.0888349711894989,
 0.02692333422601223,
 0.18709281086921692,
 -0.002944737207144499]

Step 4:

Creating documents
In [13]:
# One bulk "index" action per title. The row position doubles as the document
# id and selects the matching embedding from `vector`.
requests = [
    {
        "_op_type": "index",
        "_index": "myml",
        "_id": position,
        "title": title_text,
        "title_vector": vector[position],
    }
    for position, title_text in enumerate(titles)
]
In [14]:
# Inspect the first bulk action to verify its shape before indexing.
requests[0]
Out[14]:
{'_op_type': 'index',
 '_index': 'myml',
 '_id': 0,
 'title': 'Norm of the North: King Sized Adventure',
 'title_vector': [0.08594007790088654,
  -0.09169773757457733,
  -0.08221833407878876,
  0.1603367030620575,
  0.05244443565607071,
  0.11267174780368805,
  -0.08382084965705872,
  0.09882047772407532,
  0.021728241816163063,
  -0.18144601583480835,
  0.0012927550124004483,
  0.030685214325785637,
  -0.04533662274479866,
  -0.07281118631362915,
  -0.11955679953098297,
  0.017013853415846825,
  0.033623743802309036,
  -0.009736376814544201,
  0.033763282001018524,
  0.1921098679304123,
  0.00620125001296401,
  0.015555041842162609,
  0.06574436277151108,
  0.11323074996471405,
  -0.10774067789316177,
  0.1693897843360901,
  -0.13922490179538727,
  -0.10309454798698425,
  -5.2244857215555385e-05,
  0.023089049383997917,
  0.04559335112571716,
  -0.10510903596878052,
  -0.1005614772439003,
  -0.07881765812635422,
  0.025743374601006508,
  -0.05974612385034561,
  -0.1747055947780609,
  -0.05892287939786911,
  -0.06596986949443817,
  -0.09151236712932587,
  0.03593139722943306,
  -0.07345644384622574,
  -0.018012331798672676,
  0.036221787333488464,
  0.07314501702785492,
  -0.06195896118879318,
  -0.0023348417598754168,
  -0.1982719600200653,
  -0.3291093707084656,
  0.006821473129093647,
  0.1486814171075821,
  0.2550199031829834,
  0.1663597822189331,
  0.15605349838733673,
  0.12756910920143127,
  -0.057475071400403976,
  0.14456160366535187,
  -0.05416375771164894,
  0.06393317133188248,
  -0.08582285046577454,
  0.019529936835169792,
  0.030426720157265663,
  -0.13159017264842987,
  -0.01176383811980486,
  -0.05212199687957764,
  -0.007775180973112583,
  0.0005310662090778351,
  0.03532465547323227,
  0.14036867022514343,
  -0.04217003658413887,
  -0.0504852756857872,
  0.08859632164239883,
  0.02489238791167736,
  0.036609407514333725,
  0.012656561098992825,
  -0.031059175729751587,
  0.13535012304782867,
  -0.07467728853225708,
  -0.00639297952875495,
  -0.007216154597699642,
  0.10756982862949371,
  -0.03459356725215912,
  0.05434964969754219,
  0.10563021898269653,
  -0.023835688829421997,
  -0.1384897232055664,
  -0.10662095248699188,
  -0.11560706794261932,
  -0.018126854673027992,
  -0.11542601138353348,
  0.05233073979616165,
  -0.08457083255052567,
  0.04891547933220863,
  0.048610806465148926,
  -0.0861951932311058,
  -0.1646905094385147,
  0.05879170447587967,
  -0.09346245974302292,
  0.21104931831359863,
  0.07167480885982513,
  0.09941790252923965,
  -0.04874766618013382,
  -0.11821635812520981,
  -0.11691499501466751,
  -0.04042290896177292,
  -0.035517025738954544,
  0.006470585707575083,
  0.07046835869550705,
  0.032006461173295975,
  -0.017604319378733635,
  0.1958240568637848,
  0.01993837021291256,
  -0.01663972996175289,
  0.11849723011255264,
  -0.10080186277627945,
  -0.009301570244133472,
  0.03264541178941727,
  -0.03453604504466057,
  -0.032728590071201324,
  -0.06038405001163483,
  -0.014748498797416687,
  -0.08714324235916138,
  0.0329294428229332,
  -0.04497246816754341,
  -0.0888349711894989,
  0.02692333422601223,
  0.18709281086921692,
  -0.002944737207144499]}
Define mappings
In [30]:
# Index-level settings. "index.knn" switches on k-NN search for the index;
# NOTE(review): it is only accepted when a plugin providing it is installed.
# Without one, the cluster answers "unknown setting [index.knn]".
index_settings = {
    "number_of_shards": 2,
    "number_of_replicas": 1,
    "index.knn": True,
}

# Field definitions: the raw title as full text, plus its embedding.
# The dimension must equal the length of each embedding vector (128).
field_properties = {
    "title": {"type": "text"},
    "title_vector": {"type": "knn_vector", "dimension": 128},
}

# Request body passed to the index-creation call.
settings = {
    "settings": index_settings,
    "mappings": {
        "dynamic": "true",
        "_source": {"enabled": "true"},
        "properties": field_properties,
    },
}
In [31]:
IndexName = 'myml'

# ignore=[400, 404] makes the client return the error body instead of raising,
# so the cell can be re-run when the index already exists. A 400 is also what
# the cluster sends for invalid settings (e.g. "unknown setting [index.knn]"),
# so the response is inspected and every error other than "already exists" is
# surfaced rather than silently ignored.
my = es.indices.create(index=IndexName, ignore=[400, 404], body=settings)

creation_error = my.get("error")
if creation_error:
    error_type = creation_error.get("type") if isinstance(creation_error, dict) else None
    if error_type != "resource_already_exists_exception":
        raise RuntimeError(
            "Could not create index {!r}: {}".format(IndexName, creation_error)
        )
In [32]:
# Show the response returned by the index-creation call.
my
Out[32]:
{'acknowledged': True, 'shards_acknowledged': True, 'index': 'myml'}

Step 5:

In [33]:
# Send every prepared action to the cluster in bulk. A failure is reported by
# printing the exception; success is reported with a short confirmation.
try:
    res = helpers.bulk(es, requests)
except Exception as err:
    print(err)
else:
    print("Working")
Working

Testing KNN model

We retrieve the titles closest to the query with the `knn` query on the `title_vector` field.
In [36]:
# Number of neighbours requested from the k-NN search and number of hits
# returned; a single constant keeps "k" and "size" in agreement.
TOP_K = 10

title = input("Enter query: ")

# Embed the query text with the same model that produced the indexed vectors,
# then convert the single-row result to a plain list of floats for the JSON body.
embeddings = embed(tf.constant([title]))
x = np.asarray(embeddings)[0].tolist()

# k-NN query against the vector field. This is the only query sent to the cluster.
script_query = {
    "knn": {
        "title_vector": {
            "vector": x,
            "k": TOP_K
        }
    }
}

response = es.search(
    index="myml",
    body={
        "size": TOP_K,
        "query": script_query,
        # Only "title" is indexed alongside the vector, so it is the only
        # source field worth returning.
        "_source": {"includes": ["title"]}
    }
)

# Print each hit's document id, score and stored source, best match first.
for hit in response["hits"]["hits"]:
    print("id: {}, score: {}".format(hit["_id"], hit["_score"]))
    print(hit["_source"])
    print()
Enter query: Swiss Army Man
id: 3241, score: 1.0
{'title': 'Swiss Army Man'}

id: 1349, score: 0.51294893
{'title': 'American Son'}

id: 6150, score: 0.48458296
{'title': 'Glitter Force'}

id: 4484, score: 0.48386106
{'title': 'A Family Man'}

id: 5785, score: 0.47989023
{'title': 'American Crime'}

id: 3784, score: 0.4735783
{'title': 'Star Men'}

id: 953, score: 0.46827134
{'title': 'Phantom Boy'}

id: 5621, score: 0.46712905
{'title': 'American Vandal'}

id: 5723, score: 0.46373478
{'title': 'Man Down'}

id: 4167, score: 0.46257648
{'title': 'Mercenary'}

3 comments:

  1. Hi, when i try to execute "my = es.indices.create(index=IndexName, ignore=[400,404], body=settings)" why always show error

    {'error': {'root_cause': [{'type': 'illegal_argument_exception',
    'reason': 'unknown setting [index.knn] please check that any required plugins are installed, or check the breaking changes documentation for removed settings'}],
    'type': 'illegal_argument_exception',
    'reason': 'unknown setting [index.knn] please check that any required plugins are installed, or check the breaking changes documentation for removed settings'},
    'status': 400}

    Do you have solutions?

    ReplyDelete
  2. Please, How do I fix error message : RequestError: RequestError(400, 'parsing_exception', 'unknown query [knn]')

    ReplyDelete
  3. to fix the error, you need to use latest version of elasticsearch syntax.
    Example from documentation:
    PUT my-index-2
    {
    "mappings": {
    "properties": {
    "my_vector": {
    "type": "dense_vector",
    "dims": 3,
    "index": true,
    "similarity": "dot_product"
    }
    }
    }
    }

    ReplyDelete

Learn How to Connect to the Glue Data Catalog using AWS Glue Iceberg REST endpoint

gluecat Learn How to Connect to the Glue Data Catalog using AWS Glue Iceberg REST e...