Friday, July 17, 2020

Comparing Different Google Pre-trained Models for Cosine Similarity

Untitled

Comparing Different Google Pre-trained Models for Cosine Similarity

  • As you know, word embeddings play a very important role, so we should always select a model only after doing some research. I took two popular pre-trained models from Google's TensorFlow Hub and was curious to know whether the similarity changes when I use different models.

  • word1 = "senior Software developer"
  • word2 = "Software engineer"

Model 1:

universal-sentence-encoder-lite 20 DIM

In [14]:
# Import everything this notebook cell relies on inside one guarded block so
# that a missing package is reported with a readable message.
try:
    import elasticsearch
    from elasticsearch import Elasticsearch

    import pandas as pd
    import json
    from ast import literal_eval
    from tqdm import tqdm
    import datetime
    import os
    import sys
    import tensorflow as tf
    import tensorflow_hub as hub

    import numpy as np

    from math import sqrt

    print("Loaded  .. . . . . . . .")
except Exception as e:
    # The handler used to bind the exception as ``E`` while formatting ``e``,
    # so a failed import raised NameError instead of printing this message.
    print("Some Modules are Missing {} ".format(e))
    
# Load the model from the current working directory.
# Hosted alternative: https://tfhub.dev/google/universal-sentence-encoder-lite/2
embed = hub.KerasLayer(os.getcwd())

# Embed each phrase in turn: wrap it in a one-element string tensor, run the
# model, and keep the first (only) row of the result as a plain Python list.
vectors = []
for tem in ("Software engineer", "senior Software developer"):
    x = tf.constant([tem])
    embeddings = embed(x)
    x = np.asarray(embeddings)
    vectors.append(x[0].tolist())

# x1 is the "Software engineer" vector, x2 the "senior Software developer" one.
x1, x2 = vectors


from math import sqrt

def cosineSim(a1, a2):
    """Return the cosine similarity of two numeric vectors.

    Computes ``dot(a1, a2) / (|a1| * |a2|)`` in a single pass over the
    paired components.

    Args:
        a1: Iterable of numbers (first vector).
        a2: Iterable of numbers (second vector).

    Returns:
        The cosine similarity, or ``0.0`` when either vector has zero
        magnitude over the paired components (including empty input),
        where the ratio is undefined.

    Note:
        Components are paired with ``zip``, so if the inputs differ in
        length the trailing components of the longer one are ignored.
    """
    # Accumulators are named so the builtin ``sum`` is no longer shadowed.
    dot = 0
    norm_sq_1 = 0
    norm_sq_2 = 0
    for u, v in zip(a1, a2):
        dot += u * v
        norm_sq_1 += u * u
        norm_sq_2 += v * v

    denominator = sqrt(norm_sq_1) * sqrt(norm_sq_2)
    if denominator == 0:
        # A zero-length vector has no direction; avoid ZeroDivisionError.
        return 0.0
    return dot / denominator

# Print the cosine similarity between the two phrase vectors x1 and x2.
print(cosineSim(x1,x2))
Loaded  .. . . . . . . .
0.8708507554664806

tf2-preview/nnlm-en-dim50

  • Collection of feed-forward neural network language token embeddings in SavedModel 2.0 format.
  • We shall use the same code, but I have already converted the text into vectors to save time, so here we are just computing the similarity.
In [17]:
# Import everything this notebook cell relies on inside one guarded block so
# that a missing package is reported with a readable message.
try:
    import elasticsearch
    from elasticsearch import Elasticsearch

    import pandas as pd
    import json
    from ast import literal_eval
    from tqdm import tqdm
    import datetime
    import os
    import sys
    import tensorflow as tf
    import tensorflow_hub as hub

    import numpy as np

    from math import sqrt

    print("Loaded  .. . . . . . . .")
except Exception as e:
    # The handler used to bind the exception as ``E`` while formatting ``e``,
    # so a failed import raised NameError instead of printing this message.
    print("Some Modules are Missing {} ".format(e))
    
# Load the nnlm-en-dim50 model by its TensorFlow Hub URL.
# Local alternative: hub.KerasLayer(os.getcwd())
url = "https://tfhub.dev/google/tf2-preview/nnlm-en-dim50/1"
embed = hub.KerasLayer(url)

# Embed each phrase in turn: wrap it in a one-element string tensor, run the
# model, and keep the first (only) row of the result as a plain Python list.
vectors = []
for tem in ("Software engineer", "senior Software developer"):
    x = tf.constant([tem])
    embeddings = embed(x)
    x = np.asarray(embeddings)
    vectors.append(x[0].tolist())

# x1 is the "Software engineer" vector, x2 the "senior Software developer" one.
x1, x2 = vectors


from math import sqrt

def cosineSim(a1, a2):
    """Return the cosine similarity of two numeric vectors.

    Computes ``dot(a1, a2) / (|a1| * |a2|)`` in a single pass over the
    paired components.

    Args:
        a1: Iterable of numbers (first vector).
        a2: Iterable of numbers (second vector).

    Returns:
        The cosine similarity, or ``0.0`` when either vector has zero
        magnitude over the paired components (including empty input),
        where the ratio is undefined.

    Note:
        Components are paired with ``zip``, so if the inputs differ in
        length the trailing components of the longer one are ignored.
    """
    # Accumulators are named so the builtin ``sum`` is no longer shadowed.
    dot = 0
    norm_sq_1 = 0
    norm_sq_2 = 0
    for u, v in zip(a1, a2):
        dot += u * v
        norm_sq_1 += u * u
        norm_sq_2 += v * v

    denominator = sqrt(norm_sq_1) * sqrt(norm_sq_2)
    if denominator == 0:
        # A zero-length vector has no direction; avoid ZeroDivisionError.
        return 0.0
    return dot / denominator

# Print the cosine similarity between the two phrase vectors x1 and x2.
print(cosineSim(x1,x2))
0.7837005327886105

ELMO

  • Embeddings from a language model trained on the 1 Billion Word Benchmark.
In [20]:
# Import everything this notebook cell relies on inside one guarded block so
# that a missing package is reported with a readable message.
try:
    import elasticsearch
    from elasticsearch import Elasticsearch

    import pandas as pd
    import json
    from ast import literal_eval
    from tqdm import tqdm
    import datetime
    import os
    import sys
    import tensorflow as tf
    import tensorflow_hub as hub

    import numpy as np

    from math import sqrt

    print("Loaded  .. . . . . . . .")
except Exception as e:
    # The handler used to bind the exception as ``E`` while formatting ``e``,
    # so a failed import raised NameError instead of printing this message.
    print("Some Modules are Missing {} ".format(e))
    
# Load the model from the current working directory.
# Hosted alternative: https://tfhub.dev/google/elmo/3
embed = hub.KerasLayer(os.getcwd())

# Embed each phrase in turn: wrap it in a one-element string tensor, run the
# model, and keep the first (only) row of the result as a plain Python list.
vectors = []
for tem in ("Software engineer", "senior Software developer"):
    x = tf.constant([tem])
    embeddings = embed(x)
    x = np.asarray(embeddings)
    vectors.append(x[0].tolist())

# x1 is the "Software engineer" vector, x2 the "senior Software developer" one.
x1, x2 = vectors


from math import sqrt

def cosineSim(a1, a2):
    """Return the cosine similarity of two numeric vectors.

    Computes ``dot(a1, a2) / (|a1| * |a2|)`` in a single pass over the
    paired components.

    Args:
        a1: Iterable of numbers (first vector).
        a2: Iterable of numbers (second vector).

    Returns:
        The cosine similarity, or ``0.0`` when either vector has zero
        magnitude over the paired components (including empty input),
        where the ratio is undefined.

    Note:
        Components are paired with ``zip``, so if the inputs differ in
        length the trailing components of the longer one are ignored.
    """
    # Accumulators are named so the builtin ``sum`` is no longer shadowed.
    dot = 0
    norm_sq_1 = 0
    norm_sq_2 = 0
    for u, v in zip(a1, a2):
        dot += u * v
        norm_sq_1 += u * u
        norm_sq_2 += v * v

    denominator = sqrt(norm_sq_1) * sqrt(norm_sq_2)
    if denominator == 0:
        # A zero-length vector has no direction; avoid ZeroDivisionError.
        return 0.0
    return dot / denominator

# Print the cosine similarity between the two phrase vectors x1 and x2.
print(cosineSim(x1,x2))
0.7253164287990639

No comments:

Post a Comment

Learn How to configure your Spark Session to Join Managed (S3 Table Buckets) and Unmanaged Iceberg Tables | Hands on Labs

test-tble-bucket-joins Learn How to configure your Spark Session to Join Managed (S...