Friday, July 17, 2020

Comparing Different Google Pre-trained Models for Cosine Similarity


  • Word embeddings play a very important role in any similarity task, so the model should always be selected after doing some research. I took a few popular pre-trained models from Google on TensorFlow Hub and was curious whether the similarity score for the same pair of phrases changes when a different model is used (a quick sketch of the metric follows this list).

  • word1 = "senior Software developer"
  • word2 = "Software engineer"
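
All three comparisons below boil down to plain cosine similarity between two embedding vectors. As a quick reference, here is a minimal NumPy sketch of that metric, equivalent to the cosineSim helper used in each cell below:

import numpy as np

def cosine_similarity(a, b):
    # cos(a, b) = (a . b) / (|a| * |b|)
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))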

Model 1:

universal-sentence-encoder-lite 20 DIM

In [14]:
try:
    # only the modules this notebook actually uses
    import os
    import numpy as np
    import tensorflow as tf
    import tensorflow_hub as hub
    from math import sqrt

    print("Loaded ...")
except Exception as e:
    print("Some modules are missing: {}".format(e))

# load the SavedModel that was downloaded into the current working directory
embed = hub.KerasLayer(os.getcwd())
# url = "https://tfhub.dev/google/universal-sentence-encoder-lite/2"
# embed = hub.KerasLayer(url)

tem = "Software engineer"
x = tf.constant([tem])
embeddings = embed(x)
x = np.asarray(embeddings)
x1 = x[0].tolist()


tem = "senior Software developer"
x = tf.constant([tem])
embeddings = embed(x)
x = np.asarray(embeddings)
x2 = x[0].tolist()


def cosineSim(a1, a2):
    # cosine similarity: dot(a1, a2) / (|a1| * |a2|)
    dot = 0.0
    suma1 = 0.0
    sumb1 = 0.0
    for i, j in zip(a1, a2):
        suma1 += i * i
        sumb1 += j * j
        dot += i * j
    return dot / (sqrt(suma1) * sqrt(sumb1))

print(cosineSim(x1, x2))
Loaded ...
0.8708507554664806
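
As a sanity check, the same value can be reproduced with NumPy's vector operations (a quick sketch, reusing the x1 and x2 lists from the cell above):

import numpy as np
print(np.dot(x1, x2) / (np.linalg.norm(x1) * np.linalg.norm(x2)))
# prints the same value, ~0.8709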

tf2-preview/nnlm-en-dim50

  • A collection of feed-forward neural network language-model token embeddings in SavedModel 2.0 format.
  • We reuse the same code as above; only the embedding module changes (loaded here directly from its TF Hub URL) before computing the similarity.
In [17]:
try:
    # same imports as above
    import os
    import numpy as np
    import tensorflow as tf
    import tensorflow_hub as hub
    from math import sqrt

    print("Loaded ...")
except Exception as e:
    print("Some modules are missing: {}".format(e))

# this time the module is loaded directly from TF Hub
# embed = hub.KerasLayer(os.getcwd())
url = "https://tfhub.dev/google/tf2-preview/nnlm-en-dim50/1"
embed = hub.KerasLayer(url)

tem = "Software engineer"
x = tf.constant([tem])
embeddings = embed(x)
x = np.asarray(embeddings)
x1 = x[0].tolist()


tem = "senior Software developer"
x = tf.constant([tem])
embeddings = embed(x)
x = np.asarray(embeddings)
x2 = x[0].tolist()


def cosineSim(a1, a2):
    # cosine similarity: dot(a1, a2) / (|a1| * |a2|)
    dot = 0.0
    suma1 = 0.0
    sumb1 = 0.0
    for i, j in zip(a1, a2):
        suma1 += i * i
        sumb1 += j * j
        dot += i * j
    return dot / (sqrt(suma1) * sqrt(sumb1))

print(cosineSim(x1, x2))
0.7837005327886105

ELMo

  • Embeddings from a language model trained on the 1 Billion Word Benchmark.
In [20]:
try:
    # same imports as above
    import os
    import numpy as np
    import tensorflow as tf
    import tensorflow_hub as hub
    from math import sqrt

    print("Loaded ...")
except Exception as e:
    print("Some modules are missing: {}".format(e))

# load the locally downloaded ELMo SavedModel from the current working directory
embed = hub.KerasLayer(os.getcwd())
# url = "https://tfhub.dev/google/elmo/3"
# embed = hub.KerasLayer(url)

tem = "Software engineer"
x = tf.constant([tem])
embeddings = embed(x)
x = np.asarray(embeddings)
x1 = x[0].tolist()


tem = "senior Software developer"
x = tf.constant([tem])
embeddings = embed(x)
x = np.asarray(embeddings)
x2 = x[0].tolist()


def cosineSim(a1, a2):
    # cosine similarity: dot(a1, a2) / (|a1| * |a2|)
    dot = 0.0
    suma1 = 0.0
    sumb1 = 0.0
    for i, j in zip(a1, a2):
        suma1 += i * i
        sumb1 += j * j
        dot += i * j
    return dot / (sqrt(suma1) * sqrt(sumb1))

print(cosineSim(x1, x2))
0.7253164287990639
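
Summary: for the same pair of phrases ("senior Software developer" vs. "Software engineer"), the three models give noticeably different cosine similarities:

  • universal-sentence-encoder-lite: 0.8709
  • tf2-preview/nnlm-en-dim50: 0.7837
  • elmo: 0.7253

So the choice of pre-trained embedding model really does change the similarity score, which is worth keeping in mind before wiring any one of them into a search or matching pipeline.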
