Hello everyone. Do you use machine learning or deep learning, or maybe just want to normalize a dataset? We all know how difficult and time-consuming that process can be. With this library you can do it in 3 steps.
This module allows the user to normalize the columns in a dataset, perform a train/test split, and also drops any rows or columns that have NULL/NA values.
How to Use
if __name__ == "__main__":
    # ------------------------------------------------------------------
    # Step 1: create a Data_Cleaning object pointing at the CSV file.
    # ------------------------------------------------------------------
    path = "/Users/soumilshah/IdeaProjects/mytensorflow/Dataset/pima-indians-diabetes.csv"
    c = Data_Cleaning(path=path)

    # ------------------------------------------------------------------
    # Step 2: rename the columns.
    #
    # Provide a new name for every column in the file: if the file has
    # 8 columns, supply 8 names. get_column_names() returns the current
    # column names if you need to check them first.
    #
    # This step can be skipped when the columns already have the names
    # you want; in that case call read_df() and pass its result to
    # step 3 instead.
    # ------------------------------------------------------------------
    columns_to_named = [
        "Pregnancies",
        "Glucose",
        "BloodPressure",
        "SkinThickness",
        "Insulin",
        "BMI",
        "DiabetesPedigreeFunction",
        "Age",
        "Class",
    ]
    df = c.rename_column(column_rename=columns_to_named)

    # ------------------------------------------------------------------
    # Step 3: normalize and split.
    #
    # Choose the columns to normalize, name the column that holds the
    # classification label, and pass both together with the pandas
    # DataFrame from step 2.
    # ------------------------------------------------------------------
    columns_norm = [
        "Pregnancies",
        "Glucose",
        "BloodPressure",
        "SkinThickness",
        "Insulin",
        "BMI",
        "DiabetesPedigreeFunction",
        "Age",
    ]
    classification_column = ["Class"]

    X_Train, X_Test, Y_Train, Y_Test = c.feature_map_train_test_split(
        df=df,
        columns_norm=columns_norm,
        classfication_column=classification_column,
    )
Code for the Library or Module
# Third-party imports are guarded so that a missing package produces a short
# message instead of a traceback. Note that the imports run in order: every
# name after the first failing import stays undefined.
try:
    import tensorflow as tf

    # for Data Processing
    import numpy as np
    import pandas as pd

    # for Plotting
    import matplotlib.pyplot as plt

    # for Data Processing
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import confusion_matrix, classification_report

    print('Library Loaded .........')
except Exception as error:
    # `Exception` rather than a bare `except:` so that Ctrl-C
    # (KeyboardInterrupt) and SystemExit raised during a slow import are
    # not swallowed.
    print('One or More Library was not Found ! ')
    # Say which import failed; the generic message alone does not.
    print('Import problem: {!r}'.format(error))
class Data_Cleaning(object):
    """Read a CSV file with pandas and prepare it for model training.

    Every public method reports a failure by printing a message and
    returning ``None`` instead of raising, so callers should check the
    result before using it.
    """

    def __init__(self, path=''):
        """
        :param path: Location of the CSV file, passed to ``pandas.read_csv``
        """
        self.path = path

    @staticmethod
    def _report_failure(message, error):
        """Print the failure message followed by the exception that caused it."""
        print(message)
        print("  Reason: {}: {}".format(type(error).__name__, error))

    @staticmethod
    def _min_max_scale(column):
        """Scale one column with ``(x - min) / (max - min)``.

        :param column: Numeric pandas Series
        :return: Scaled pandas Series; a constant column becomes all 0.0
        """
        lowest = column.min()
        spread = column.max() - lowest
        if spread == 0:
            # A constant column would otherwise compute 0 / 0 and turn the
            # whole feature into NaN.
            return (column - lowest).astype(float)
        return (column - lowest) / spread

    def get_column_names(self):
        """
        :return: Column labels of the CSV file (a pandas Index),
                 or None if the file could not be read
        """
        try:
            return pd.read_csv('{}'.format(self.path)).columns
        except Exception as error:
            self._report_failure("Failed to run get_columns_names", error)
            return None

    def get_length(self):
        """
        :return: Number of columns in the CSV file,
                 or None if the columns could not be read
        """
        try:
            # get_column_names() returns None on failure; len(None) raises
            # TypeError, which is reported below.
            return len(self.get_column_names())
        except Exception as error:
            self._report_failure("Failed to execute get_length", error)
            return None

    def read_df(self):
        """
        :return: Pandas DF read from ``self.path``, or None on failure
        """
        try:
            return pd.read_csv(self.path)
        except Exception as error:
            self._report_failure("Failed to read_df", error)
            return None

    def rename_column(self, column_rename=None):
        """
        Read the CSV file and label its columns with the supplied names.

        The file is read with ``header=0``, so its first line is treated
        as a header row and replaced by ``column_rename``.

        :param column_rename: Should be a list with one name per column
        :return: Pandas DF with the new column names, or None on failure
        """
        # None replaces the former mutable ``[]`` default; pandas still
        # receives an empty list when nothing is supplied.
        if column_rename is None:
            column_rename = []
        try:
            # Read the Dataset and Rename the Column
            return pd.read_csv(self.path, header=0, names=column_rename)
        except Exception as error:
            self._report_failure("Failed to rename column", error)
            return None

    def feature_map_train_test_split(self, df, columns_norm=None, classfication_column=None):
        """
        Min-max normalize the feature columns and split into train/test sets.

        :param df: Should be a Pandas DF
        :param columns_norm: Should be List of columns that you want to Normalize
        :param classfication_column: Should be List of column that you want your Network to classify
        :return: Tuple (X_Train, X_Test, Y_Train, Y_Test) from a 70/30 split
                 with random_state=101, or None on failure
        """
        # None replaces the former mutable ``[]`` defaults.
        if columns_norm is None:
            columns_norm = []
        if classfication_column is None:
            classfication_column = []
        try:
            # Select the Feature map Column and scale each one on its own
            X_Data = df[columns_norm].apply(self._min_max_scale)
            Y_Data = df[classfication_column]
            X_Train, X_Test, Y_Train, Y_Test = train_test_split(
                X_Data,
                Y_Data,
                test_size=0.3,
                random_state=101,
            )
            return X_Train, X_Test, Y_Train, Y_Test
        except Exception as error:
            self._report_failure("Failed to execute feature_map_train_test_split ", error)
            return None
No comments:
Post a Comment