Develop Full-Text Search (Semantic Search) with Postgres (pgvector) and Python: Hands-on Lab¶
Download Dataset from Kaggle¶
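The CSV used below, data job posts.csv, appears to be Kaggle's Online Job Postings dataset. As a minimal sketch (assuming the dataset slug madhab/jobposts and a Kaggle API token in ~/.kaggle/kaggle.json), it can be pulled with the Kaggle Python client:

from kaggle.api.kaggle_api_extended import KaggleApi

# Assumes `pip install kaggle` and a valid ~/.kaggle/kaggle.json token;
# the dataset slug below is an assumption based on the CSV file name used later.
api = KaggleApi()
api.authenticate()
api.dataset_download_files("madhab/jobposts", path=".", unzip=True)  # extracts "data job posts.csv"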
Postgres with PgVector¶
# Compose a Postgres database together with the pgvector extension
services:
  db:
    hostname: db
    image: ankane/pgvector
    ports:
      - 5432:5432
    restart: always
    environment:
      - POSTGRES_DB=vectordb
      - POSTGRES_USER=testuser
      - POSTGRES_PASSWORD=testpwd
      - POSTGRES_HOST_AUTH_METHOD=trust
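Bring the stack up with docker compose up -d. Before moving on, a quick connectivity check (a small sketch using psycopg2 with the same credentials as the compose file above) confirms the database is reachable:

import psycopg2

# Connection parameters mirror the docker-compose environment above
conn = psycopg2.connect(
    dbname="vectordb", user="testuser", password="testpwd",
    host="127.0.0.1", port=5432,
)
with conn.cursor() as cur:
    cur.execute("SELECT version();")
    print(cur.fetchone()[0])
conn.close()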
In [1]:
try:
    import json
    import os
    import uuid

    import pandas as pd
    import numpy as np
    from tqdm import tqdm
    from sentence_transformers import SentenceTransformer, util
    import psycopg2
    from psycopg2 import sql
except Exception as e:
    print("Some modules are missing: {}".format(e))
Define Helper methods¶
In [2]:
class Reader(object):
    def __init__(self, file_name):
        self.file_name = file_name

    def run(self):
        # Read only the first chunk of 3,000 rows to keep the demo fast
        df = pd.read_csv(self.file_name, chunksize=3000)
        df = next(df)
        df = df.fillna("")
        return df
In [3]:
class Tokenizer(object):
    def __init__(self):
        self.model = SentenceTransformer('all-MiniLM-L6-v2')

    def get_token(self, documents):
        # Encode a single document and return its embedding as a plain Python list
        sentences = [documents]
        sentence_embeddings = self.model.encode(sentences)
        return sentence_embeddings.flatten().tolist()
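For reference, all-MiniLM-L6-v2 produces 384-dimensional sentence embeddings. A quick sanity check of the tokenizer (a small sketch, using an arbitrary example sentence):

helper_token = Tokenizer()
vec = helper_token.get_token("Senior Python developer with AWS experience")
print(len(vec))   # 384 dimensions for all-MiniLM-L6-v2
print(vec[:5])    # first few components of the embedding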
Read Data and create Vectors¶
In [4]:
helper = Reader(file_name="data job posts.csv")
df = helper.run()
tqdm.pandas()
helper_token = Tokenizer()
df["vectors"] = df["jobpost"].progress_apply(helper_token.get_token)
Create Table in Postgres¶
In [5]:
%load_ext sql
%sql postgresql+psycopg2://testuser:testpwd@127.0.0.1:5432/vectordb
In [6]:
%%sql
CREATE EXTENSION IF NOT EXISTS vector;
Out[6]:
In [7]:
%%sql
CREATE TABLE jobs (
    jobpost TEXT,
    date TEXT,
    Title TEXT,
    Company TEXT,
    AnnouncementCode TEXT,
    Term TEXT,
    Eligibility TEXT,
    Audience TEXT,
    StartDate TEXT,
    Duration TEXT,
    Location TEXT,
    JobDescription TEXT,
    JobRequirment TEXT,
    RequiredQual TEXT,
    Salary TEXT,
    ApplicationP TEXT,
    OpeningDate TEXT,
    Deadline TEXT,
    Notes TEXT,
    AboutC TEXT,
    Attach TEXT,
    Year TEXT,
    Month TEXT,
    IT TEXT,
    vectors VECTOR
);
Out[7]:
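Note that the vectors column above is declared as VECTOR without a fixed dimension. pgvector accepts that, but creating an approximate-nearest-neighbour index (e.g. IVFFlat) requires a typed column such as VECTOR(384), matching the all-MiniLM-L6-v2 embedding size. A sketch of that optional variant, run through psycopg2 instead of the %%sql magic:

import psycopg2

conn = psycopg2.connect(dbname="vectordb", user="testuser", password="testpwd",
                        host="127.0.0.1", port=5432)
with conn.cursor() as cur:
    # Assumes the column was created as VECTOR(384); the IVFFlat index speeds up
    # cosine-distance queries once the table contains data.
    cur.execute("""
        CREATE INDEX IF NOT EXISTS jobs_vectors_idx
        ON jobs USING ivfflat (vectors vector_cosine_ops)
        WITH (lists = 100);
    """)
    conn.commit()
conn.close()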
In [8]:
df.head(2)
Out[8]:
Define helper functions¶
In [9]:
def connect_to_db():
    dbname = 'vectordb'
    user = 'testuser'
    password = 'testpwd'
    host = '127.0.0.1'
    port = '5432'
    conn = psycopg2.connect(dbname=dbname, user=user, password=password, host=host, port=port)
    return conn


def insert_data_into_table(row, conn):
    # Create a cursor
    cur = conn.cursor()

    # Define the insert statement
    insert_statement = sql.SQL("""
        INSERT INTO jobs (
            jobpost, date, Title, Company, AnnouncementCode, Term,
            Eligibility, Audience, StartDate, Duration, Location,
            JobDescription, JobRequirment, RequiredQual, Salary,
            ApplicationP, OpeningDate, Deadline, Notes, AboutC,
            Attach, Year, Month, IT, vectors
        ) VALUES (
            %s, %s, %s, %s, %s, %s,
            %s, %s, %s, %s, %s,
            %s, %s, %s, %s,
            %s, %s, %s, %s, %s,
            %s, %s, %s, %s, %s
        )
    """)

    # Convert row values to a tuple, handling empty date values
    values = [val if val != '' else None for val in row]

    # Execute the insert statement
    cur.execute(insert_statement, values)

    # Commit the transaction
    conn.commit()

    # Close the cursor
    cur.close()


# Function to query similar documents based on an embedding
def find_similar_documents(embedding):
    connection = connect_to_db()
    cursor = connection.cursor()
    try:
        # Query to find similar documents using cosine similarity
        query = f"""
            SELECT Title, 1 - (vectors <=> '{embedding}') AS cosine_similarity
            FROM jobs
            ORDER BY cosine_similarity DESC
            LIMIT 3
        """
        cursor.execute(query)
        similar_documents = cursor.fetchall()
        return similar_documents
    except Exception as error:
        print("Error:", error)
        return None
    finally:
        cursor.close()
        connection.close()
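The f-string above interpolates the embedding list directly into the SQL text, relying on pgvector parsing the resulting '[x, y, ...]' literal; <=> is pgvector's cosine distance operator. A parameterized variant (a sketch with a hypothetical name, functionally equivalent under that assumption) avoids string-formatted SQL:

def find_similar_documents_param(embedding, limit=3):
    # Same query as above, but the embedding is bound as a parameter;
    # str(embedding) yields the '[x, y, ...]' literal pgvector understands.
    conn = connect_to_db()
    try:
        with conn.cursor() as cur:
            cur.execute(
                """
                SELECT Title, 1 - (vectors <=> %s::vector) AS cosine_similarity
                FROM jobs
                ORDER BY cosine_similarity DESC
                LIMIT %s
                """,
                (str(embedding), limit),
            )
            return cur.fetchall()
    finally:
        conn.close()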
Ingest Documents into Postgres¶
In [10]:
# Connect to the PostgreSQL database
conn = connect_to_db()

# Iterate through each row in the DataFrame and insert into the table
for index, row in tqdm(df.iterrows(), total=df.shape[0], desc="Inserting data into PostgreSQL"):
    insert_data_into_table(row, conn)

# Close the database connection
conn.close()
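A quick check (an optional sketch) that the ingest actually landed in the table:

conn = connect_to_db()
with conn.cursor() as cur:
    cur.execute("SELECT COUNT(*) FROM jobs;")
    print("Rows in jobs table:", cur.fetchone()[0])  # expect 3000 for this chunk
conn.close()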
Query Data¶
Example query: I have experience in Python and AWS, can you suggest or recommend jobs for me?
In [11]:
helper_token = Tokenizer()
INPUT = input("Enter the Input Query ")
token_vector = helper_token.get_token(INPUT)
# Find similar documents based on the token vector
similar_docs = find_similar_documents(token_vector)
# Print the similar documents
if similar_docs:
print("Similar Documents:")
for doc in similar_docs:
print(doc)
else:
print("No similar documents found.")