# Source code for hmm.classification

from sklearn.model_selection import train_test_split, cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from snorkel.analysis import metric_score
import numpy as np


class Classifier:
	"""
	A simple classification pipeline wrapping the `sklearn` library.

	- transforms (imputes, encodes/scales) categorical and numerical features
	- fits a classifier
	- computes accuracy scores for the classifier
	"""

	def __init__(self, num_features, cat_features, clf=None):
		"""
		Build the feature preprocessor and the full classification pipeline.

		:param num_features: a list of df keys for the numerical features
		:param cat_features: a list of df keys for the categorical features
		:param clf: a classification (discriminative) model; when ``None`` (the default)
			a new ``RandomForestClassifier(n_estimators=100)`` is created for this instance
		"""
		# Build the default model here instead of in the signature: a default argument
		# is evaluated once at definition time, so every Classifier created without an
		# explicit `clf` would otherwise share (and refit) one and the same estimator.
		if clf is None:
			clf = RandomForestClassifier(n_estimators=100)

		# categorical features are imputed with a constant and one-hot encoded
		cat_transformer = Pipeline([
			('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
			('oh_enc', OneHotEncoder(handle_unknown='ignore'))
		])
		# numerical features are imputed with the median and normalized
		num_transformer = Pipeline(steps=[
			('imputer', SimpleImputer(strategy='median')),
			('scaler', StandardScaler())
		])
		# a preprocessor for all the features, column by column
		self.preprocessor = ColumnTransformer(
			transformers=[
				('num', num_transformer, num_features),
				('cat', cat_transformer, cat_features)
			]
		)
		self.clf = self.get_clf(clf)

	def get_clf(self, model):
		"""
		Construct the pipeline - a feature preprocessor and a classification model.

		Reads `self.preprocessor`, so it must be called after the preprocessor is set.

		:param model: a classification (discriminative) model
		:return: a `sklearn` pipeline with the steps 'preprocessor' and 'classifier'
		"""
		return Pipeline([
			('preprocessor', self.preprocessor),
			('classifier', model)
		])

	def fit(self, X, y):
		"""
		Fit the pipeline on a labeled dataset.

		:param X: the data
		:param y: the ground-truth labels
		:return: the fitted pipeline (`self.clf`)
		"""
		self.clf.fit(X, y)
		# hand back the pipeline so the documented return value is actually provided
		return self.clf

	def score(self, X, y, verbose=True):
		"""
		Score the pipeline for accuracy on a test set.

		Predictions are rounded with `np.round` before being compared to the labels,
		so the pipeline's predictions are expected to be numeric.

		:param X: the test data
		:param y: ground-truth labels for the test data
		:param verbose: whether or not to print test accuracy
		:return: accuracy on the test set
		"""
		# calculate rounded predictions
		preds_test = np.round(self.clf.predict(X))
		# calculate the accuracy
		test_acc = metric_score(golds=y, preds=preds_test, metric="accuracy")
		if verbose:
			print(f"Test Accuracy: {test_acc * 100:.1f}%")
		return test_acc

	def cross_val(self, X, y, cv=5, verbose=True):
		"""
		Cross validate the pipeline.

		:param X: a dataset
		:param y: the ground-truth labels
		:param cv: number of folds in the cross validation
		:param verbose: forwarded unchanged to `cross_validate` as its `verbose` argument
		:return: the cross validation score object (`sklearn`)
		"""
		return cross_validate(self.clf, X, y, cv=cv, verbose=verbose)


def train_test_val_dev_split(X, y, *, random_state=1):
	"""
	Split the dataset into four partitions: training (64%), testing (16%), validation (16%), and development (4%).
	- Training is for fitting the model.
	- Testing is for testing the fitted model and parameter tuning.
	- Validation is for final testing after tuning parameters.
	- Development is for examining individual rows and performing unit tests.

	The percentages come from three successive 80/20 splits: the full data is split
	into train/test, the train part is split again into train/validation, and the
	test part is split again into test/development.

	:param X: the dataset
	:param y: the ground-truth labels
	:param random_state: seed passed to each of the three `train_test_split` calls
		(keyword-only; the default of 1 reproduces the previously hard-coded value)
	:return: eight objects, in this order:
		X_train, X_test, X_val, X_dev, y_train, y_test, y_val, y_dev
	"""
	# every split holds out the same fraction of its input
	holdout = 0.2
	# 100% -> 80% train / 20% test
	X_train, X_test, y_train, y_test = train_test_split(
		X, y, test_size=holdout, random_state=random_state
	)
	# 80% train -> 64% train / 16% validation
	X_train, X_val, y_train, y_val = train_test_split(
		X_train, y_train, test_size=holdout, random_state=random_state
	)
	# 20% test -> 16% test / 4% development
	X_test, X_dev, y_test, y_dev = train_test_split(
		X_test, y_test, test_size=holdout, random_state=random_state
	)
	return X_train, X_test, X_val, X_dev, y_train, y_test, y_val, y_dev