Friday, July 17, 2020

Comparing Different Google Pre-trained Models for Cosine Similarity


  • Word embeddings play a very important role in any similarity task, so the model should always be selected after doing some research. I took a few popular pre-trained models from Google on TensorFlow Hub and was curious whether the similarity score for the same pair of phrases changes when a different model is used (a quick sketch of the metric follows this list).

  • word1 = "senior Software developer"
  • word2 = "Software engineer"
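
All three comparisons below boil down to plain cosine similarity between two embedding vectors. As a quick reference, here is a minimal NumPy sketch of that metric, equivalent to the cosineSim helper used in each cell below:

import numpy as np

def cosine_similarity(a, b):
    # cos(a, b) = (a . b) / (|a| * |b|)
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))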

Model 1:

universal-sentence-encoder-lite 20 DIM

In [14]:
try:
    # only the modules this notebook actually uses
    import os
    import numpy as np
    import tensorflow as tf
    import tensorflow_hub as hub
    from math import sqrt

    print("Loaded ...")
except Exception as e:
    print("Some modules are missing: {}".format(e))

# load the SavedModel that was downloaded into the current working directory
embed = hub.KerasLayer(os.getcwd())
# url = "https://tfhub.dev/google/universal-sentence-encoder-lite/2"
# embed = hub.KerasLayer(url)

tem = "Software engineer"
x = tf.constant([tem])
embeddings = embed(x)
x = np.asarray(embeddings)
x1 = x[0].tolist()


tem = "senior Software developer"
x = tf.constant([tem])
embeddings = embed(x)
x = np.asarray(embeddings)
x2 = x[0].tolist()


def cosineSim(a1, a2):
    # cosine similarity: dot(a1, a2) / (|a1| * |a2|)
    dot = 0.0
    suma1 = 0.0
    sumb1 = 0.0
    for i, j in zip(a1, a2):
        suma1 += i * i
        sumb1 += j * j
        dot += i * j
    return dot / (sqrt(suma1) * sqrt(sumb1))

print(cosineSim(x1, x2))
Loaded ...
0.8708507554664806
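
As a sanity check, the same value can be reproduced with NumPy's vector operations (a quick sketch, reusing the x1 and x2 lists from the cell above):

import numpy as np
print(np.dot(x1, x2) / (np.linalg.norm(x1) * np.linalg.norm(x2)))
# prints the same value, ~0.8709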

tf2-preview/nnlm-en-dim50

  • A collection of feed-forward neural network language-model token embeddings in SavedModel 2.0 format.
  • We reuse the same code as above; only the embedding module changes (loaded here directly from its TF Hub URL) before computing the similarity.
In [17]:
try:
    # same imports as above
    import os
    import numpy as np
    import tensorflow as tf
    import tensorflow_hub as hub
    from math import sqrt

    print("Loaded ...")
except Exception as e:
    print("Some modules are missing: {}".format(e))

# this time the module is loaded directly from TF Hub
# embed = hub.KerasLayer(os.getcwd())
url = "https://tfhub.dev/google/tf2-preview/nnlm-en-dim50/1"
embed = hub.KerasLayer(url)

tem = "Software engineer"
x = tf.constant([tem])
embeddings = embed(x)
x = np.asarray(embeddings)
x1 = x[0].tolist()


tem = "senior Software developer"
x = tf.constant([tem])
embeddings = embed(x)
x = np.asarray(embeddings)
x2 = x[0].tolist()


def cosineSim(a1, a2):
    # cosine similarity: dot(a1, a2) / (|a1| * |a2|)
    dot = 0.0
    suma1 = 0.0
    sumb1 = 0.0
    for i, j in zip(a1, a2):
        suma1 += i * i
        sumb1 += j * j
        dot += i * j
    return dot / (sqrt(suma1) * sqrt(sumb1))

print(cosineSim(x1, x2))
0.7837005327886105

ELMo

  • Embeddings from a language model trained on the 1 Billion Word Benchmark.
In [20]:
try:
    # same imports as above
    import os
    import numpy as np
    import tensorflow as tf
    import tensorflow_hub as hub
    from math import sqrt

    print("Loaded ...")
except Exception as e:
    print("Some modules are missing: {}".format(e))

# load the locally downloaded ELMo SavedModel from the current working directory
embed = hub.KerasLayer(os.getcwd())
# url = "https://tfhub.dev/google/elmo/3"
# embed = hub.KerasLayer(url)

tem = "Software engineer"
x = tf.constant([tem])
embeddings = embed(x)
x = np.asarray(embeddings)
x1 = x[0].tolist()


tem = "senior Software developer"
x = tf.constant([tem])
embeddings = embed(x)
x = np.asarray(embeddings)
x2 = x[0].tolist()


def cosineSim(a1, a2):
    # cosine similarity: dot(a1, a2) / (|a1| * |a2|)
    dot = 0.0
    suma1 = 0.0
    sumb1 = 0.0
    for i, j in zip(a1, a2):
        suma1 += i * i
        sumb1 += j * j
        dot += i * j
    return dot / (sqrt(suma1) * sqrt(sumb1))

print(cosineSim(x1, x2))
0.7253164287990639
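
Summary: for the same pair of phrases ("senior Software developer" vs. "Software engineer"), the three models give noticeably different cosine similarities:

  • universal-sentence-encoder-lite: 0.8709
  • tf2-preview/nnlm-en-dim50: 0.7837
  • elmo: 0.7253

So the choice of pre-trained embedding model really does change the similarity score, which is worth keeping in mind before wiring any one of them into a search or matching pipeline.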
