Source code for hmm.classification

from sklearn.model_selection import train_test_split, cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from snorkel.analysis import metric_score
import numpy as np


class Classifier:
    """A preprocessing + classification pipeline wrapping the `sklearn` library."""

    def __init__(self, num_features, cat_features, clf=None):
        """
        A simple classification pipeline wrapping the `sklearn` library.

        - transforms (imputes, encodes/scales) categorical and numerical features
        - fits a classifier
        - computes accuracy scores for the classifier

        :param num_features: a list of df keys for the numerical features
        :param cat_features: a list of df keys for the categorical features
        :param clf: a classification (discriminative) model; when ``None``
            (the default) a new ``RandomForestClassifier(n_estimators=100)``
            is created for this instance
        """
        # Build the default model per instance. An estimator used directly as
        # the default argument would be created once and shared (and re-fitted)
        # by every Classifier constructed without an explicit model.
        if clf is None:
            clf = RandomForestClassifier(n_estimators=100)

        # categorical features are imputed with a constant and one-hot encoded
        cat_transformer = Pipeline([
            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
            ('oh_enc', OneHotEncoder(handle_unknown='ignore')),
        ])

        # numerical features are imputed with the median and normalized
        num_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler()),
        ])

        # a preprocessor for all the features, column by column
        self.preprocessor = ColumnTransformer(
            transformers=[
                ('num', num_transformer, num_features),
                ('cat', cat_transformer, cat_features),
            ]
        )

        self.clf = self.get_clf(clf)

    def get_clf(self, model):
        """
        Construct the pipeline - a feature preprocessor and a classification model.

        :param model: a classification (discriminative) model
        :return: a `sklearn` pipeline
        """
        return Pipeline([
            ('preprocessor', self.preprocessor),
            ('classifier', model),
        ])

    def fit(self, X, y):
        """
        Fit the pipeline on a labeled dataset.

        :param X: the data
        :param y: the ground-truth labels
        :return: the fitted pipeline (``self.clf``)
        """
        self.clf.fit(X, y)
        # Hand the pipeline back so the documented return value is honoured.
        return self.clf

    def score(self, X, y, verbose=True):
        """
        Score the pipeline for accuracy on a test set.

        :param X: the test data
        :param y: ground-truth labels for the test data
        :param verbose: whether or not to print test accuracy
        :return: accuracy on the test set
        """
        # calculate rounded predictions
        preds_test = np.round(self.clf.predict(X))

        # calculate the accuracy
        test_acc = metric_score(golds=y, preds=preds_test, metric="accuracy")

        if verbose:
            print(f"Test Accuracy: {test_acc * 100:.1f}%")

        return test_acc

    def cross_val(self, X, y, cv=5, verbose=True):
        """
        Cross validate the pipeline.

        :param X: a dataset
        :param y: the ground-truth labels
        :param cv: number of folds in the cross validation
        :param verbose: forwarded unchanged to `cross_validate` as its
            ``verbose`` (verbosity level) argument
        :return: the cross validation score object (`sklearn`)
        """
        return cross_validate(self.clf, X, y, cv=cv, verbose=verbose)
def train_test_val_dev_split(X, y):
    """
    Partition a labeled dataset into training (64%), testing (16%),
    validation (16%), and development (4%) subsets.

    - Training is for fitting the model.
    - Testing is for testing the fitted model and parameter tuning.
    - Validation is for final testing after tuning parameters.
    - Development is for examining individual rows and performing unit tests.

    :param X: the dataset
    :param y: the ground-truth labels
    :return: the tuple ``(X_train, X_test, X_val, X_dev,
        y_train, y_test, y_val, y_dev)``
    """
    # every split holds out 20% and uses the same fixed seed
    split_opts = {"test_size": 0.2, "random_state": 1}

    # first cut: an 80% share and a 20% share of the full dataset
    X_major, X_minor, y_major, y_minor = train_test_split(X, y, **split_opts)

    # the 80% share becomes training (64%) and validation (16%)
    X_train, X_val, y_train, y_val = train_test_split(
        X_major, y_major, **split_opts
    )

    # the 20% share becomes testing (16%) and development (4%)
    X_test, X_dev, y_test, y_dev = train_test_split(
        X_minor, y_minor, **split_opts
    )

    return X_train, X_test, X_val, X_dev, y_train, y_test, y_val, y_dev