An introduction to several classification packages in machine learning (Python 3).
Before using most ML algorithms, we often need to preprocess the data.
Import all packages
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
# Core numeric / dataframe libraries
import numpy as np
import pandas as pd
import statsmodels.api as sm  # the modeling API lives in statsmodels.api, not the bare package

# Preprocessing, splitting, and evaluation utilities
from sklearn import preprocessing
from sklearn.model_selection import train_test_split  # split data into train/test sets
from sklearn import metrics  # scikit-learn metrics module for accuracy calculation

# Model packages
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn import tree
from sklearn.svm import SVC
Preprocessing
A function to encode data types: convert every object-type (string) feature into integer labels via label encoding.
1 2 3 4 5 6
def encoder(df):
    """Label-encode every object-dtype column of *df* in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose object (string) columns are replaced by integer codes.

    Returns
    -------
    pandas.DataFrame
        The same frame, with each object column transformed by a fresh
        LabelEncoder.
    """
    for column in df.columns:
        # `== object` is the idiomatic dtype test; the original compared
        # against `type(object)` (i.e. the metaclass `type`), which is a bug.
        if df[column].dtype == object:
            le = preprocessing.LabelEncoder()
            df[column] = le.fit_transform(df[column])
    return df
Or we can use scikit-learn's data-transformation utilities, where fit() computes properties such as the mean, min, max, and so on from the training data; transform() applies normalization, dimensionality reduction, or regularization based on the fitted data; and fit_transform() combines these two steps.
1 2 3 4 5 6 7 8 9
# Standardize features to zero mean / unit variance.
# Fit on the training data only, then apply the same transformation to the
# test data.  (The original misspelled the methods as "fit_tranform" /
# "tranform", which would raise AttributeError, and discarded the results.)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train_std = sc.fit_transform(x_train)
x_test_std = sc.transform(x_test)

# Alternatively, scale each feature into the [0, 1] range.
from sklearn.preprocessing import MinMaxScaler
mms = MinMaxScaler()
x_train_mm = mms.fit_transform(x_train)
x_test_mm = mms.transform(x_test)
Using PCA as an example of dimensionality reduction.
1 2 3
# Dimensionality reduction with PCA: keep the top 3 principal components.
from sklearn.decomposition import PCA
pca = PCA(n_components=3)
# fit_transform requires the data to reduce — the original misspelled the
# method ("fit_transdorm") and passed no argument, which would fail.
x_reduced = pca.fit_transform(x_train)
Splitting the data frame into train and test sets to verify the goodness of the model.
1 2
# A fixed random_state guarantees the same train/test split on every run.
train_x, test_x, train_y, test_y = train_test_split(
    dfhouse,                  # feature frame
    df_encode[["prod_id"]],   # target column
    test_size=0.2,            # hold out 20% of rows for testing
    random_state=42,
)
Logistic Regression Classification
1 2 3 4 5 6 7 8
def logistic_regression_classifier(train_x, train_y):
    """Fit and return an L2-regularized logistic-regression model.

    Parameters
    ----------
    train_x : array-like of shape (n_samples, n_features)
    train_y : array-like of shape (n_samples,)

    Returns
    -------
    sklearn.linear_model.LogisticRegression
        The fitted model.
    """
    # lbfgs is the default solver and supports the L2 penalty;
    # multi_class='auto' lets sklearn pick binary vs. multinomial.
    model = LogisticRegression(penalty='l2', solver="lbfgs", multi_class='auto')
    model.fit(train_x, train_y)
    return model
def random_forest_classifier(train_x, train_y):
    """Fit and return a random-forest classifier.

    Parameters
    ----------
    train_x : array-like of shape (n_samples, n_features)
    train_y : array-like of shape (n_samples,)

    Returns
    -------
    sklearn.ensemble.RandomForestClassifier
        The fitted model.
    """
    # max_depth=2 keeps each tree shallow; random_state=0 fixes the seed
    # so results are reproducible.
    model = RandomForestClassifier(max_depth=2, random_state=0)
    model.fit(train_x, train_y)
    return model
# Feature selection driven by random-forest importances.
from sklearn.feature_selection import SelectFromModel  # was used below but never imported

rf_model = random_forest_classifier(train_x, train_y)
rf_model.feature_importances_  # per-feature importance scores

# Keep only features whose importance clears the (default mean) threshold.
selector = SelectFromModel(rf_model, prefit=True)
x_new = selector.transform(x)
x_new.shape  # `shape` is an attribute, not a method — the original called it

# Alternatively, manually define my_list = [...] of column positions to
# select the important features by hand, then score the predictions.
train_x_rf = train_x.iloc[:, my_list]
metrics.accuracy_score(test_y, pred_y)
Decision Tree Classifier
1 2 3 4 5 6 7 8
def decision_tree_classifier(train_x, train_y):
    """Fit and return a CART decision tree with default hyperparameters.

    Parameters
    ----------
    train_x : array-like of shape (n_samples, n_features)
    train_y : array-like of shape (n_samples,)

    Returns
    -------
    sklearn.tree.DecisionTreeClassifier
        The fitted model.
    """
    model = tree.DecisionTreeClassifier()
    model.fit(train_x, train_y)
    return model