# Source code for skll.learner

```
"""
An easy-to-use class that wraps scikit-learn estimators.
:author: Nitin Madnani (nmadnani@ets.org)
:author: Michael Heilman (mheilman@ets.org)
:author: Dan Blanchard (dblanchard@ets.org)
:author: Aoife Cahill (acahill@ets.org)
:organization: ETS
"""
import copy
import logging
from importlib import import_module
from itertools import combinations
from math import floor, log10
from multiprocessing import cpu_count
from typing import Any, Dict, List, Optional, Tuple, Union
import joblib
import numpy as np
import scipy.sparse as sp
from sklearn.dummy import DummyClassifier, DummyRegressor # noqa: F401
from sklearn.ensemble import (
AdaBoostClassifier,
AdaBoostRegressor,
BaggingClassifier,
BaggingRegressor,
GradientBoostingClassifier,
GradientBoostingRegressor,
HistGradientBoostingClassifier,
HistGradientBoostingRegressor,
RandomForestClassifier,
RandomForestRegressor,
)
from sklearn.feature_extraction import FeatureHasher
from sklearn.kernel_approximation import ( # noqa: F401
AdditiveChi2Sampler,
Nystroem,
RBFSampler,
SkewedChi2Sampler,
)
from sklearn.linear_model import (
BayesianRidge,
ElasticNet,
HuberRegressor,
Lars,
Lasso,
LinearRegression,
LogisticRegression,
RANSACRegressor,
Ridge,
RidgeClassifier,
SGDClassifier,
SGDRegressor,
TheilSenRegressor,
)
from sklearn.linear_model._base import LinearModel
from sklearn.metrics import get_scorer_names, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor # noqa: F401
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC, SVR, LinearSVC, LinearSVR
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.utils import shuffle as sk_shuffle
from sklearn.utils.multiclass import type_of_target
from skll.data import FeatureSet
from skll.data.dict_vectorizer import DictVectorizer
from skll.data.readers import safe_float
from skll.metrics import _CUSTOM_METRICS
from skll.types import (
CrossValidateTaskResults,
EvaluateTaskResults,
FoldMapping,
IndexIterator,
LabelType,
LearningCurveSizes,
PathOrStr,
)
from skll.utils.constants import (
CORRELATION_METRICS,
KNOWN_DEFAULT_PARAM_GRIDS,
KNOWN_REQUIRES_DENSE,
MAX_CONCURRENT_PROCESSES,
)
from .utils import (
Densifier,
FilteredLeaveOneGroupOut,
SelectByMinCount,
_load_learner_from_disk,
_save_learner_to_disk,
add_unseen_labels,
compute_evaluation_metrics,
compute_num_folds_from_example_counts,
get_acceptable_classification_metrics,
get_acceptable_regression_metrics,
get_predictions,
load_custom_learner,
rescaled,
setup_cv_fold_iterator,
setup_cv_split_iterator,
train_and_score,
write_predictions,
)
# Module-level registries that `Learner.__init__` extends at runtime whenever
# a custom learner is imported:
# - `_REQUIRES_DENSE` holds the estimator classes that only accept dense
#   feature arrays; it starts out as a shallow copy of the known defaults
#   and is rebound (not mutated in place) when a custom class is added.
# - `_DEFAULT_PARAM_GRIDS` maps estimator classes to their default parameter
#   grids; it is a deep copy so that registering a custom learner never
#   modifies `KNOWN_DEFAULT_PARAM_GRIDS` itself.
_REQUIRES_DENSE = copy.copy(KNOWN_REQUIRES_DENSE)
_DEFAULT_PARAM_GRIDS = copy.deepcopy(KNOWN_DEFAULT_PARAM_GRIDS)
# public names of this module
__all__ = ["Learner", "MAX_CONCURRENT_PROCESSES", "load_custom_learner"]
[docs]
class Learner(object):
"""
A simpler interface around scikit-learn classification and regression estimators.
Parameters
----------
model_type : str
Name of estimator to create (e.g., ``'LogisticRegression'``).
See the skll package documentation for valid options.
probability : bool, default=False
Should learner return probabilities of all
labels (instead of just label with highest probability)?
pipeline : bool, default=False
Should learner contain a pipeline attribute that
contains a scikit-learn Pipeline object composed
of all steps including the vectorizer, the feature
selector, the sampler, the feature scaler, and the
actual estimator. Note that this will increase the
size of the learner object in memory and also when
it is saved to disk.
feature_scaling : str, default="none"
How to scale the features, if at all. Options are
- 'with_std': scale features using the standard deviation
- 'with_mean': center features using the mean
- 'both': do both scaling as well as centering
- 'none': do neither scaling nor centering
model_kwargs : Optional[Dict[str, Any]], default=None
A dictionary of keyword arguments to pass to the
initializer for the specified model.
pos_label : Optional[:class:`skll.types.LabelType`], default=None
An integer or string denoting the label of the class to be
treated as the positive class in a binary classification
setting. If ``None``, the class represented by the label
that appears second when sorted is chosen as the positive
class. For example, if the two labels in data are "A"
and "B" and ``pos_label`` is not specified, "B" will
be chosen as the positive class.
min_feature_count : int, default=1
The minimum number of examples a feature
must have a nonzero value in to be included.
sampler : Optional[str], default=None
The sampler to use for kernel approximation, if desired.
Valid values are
- 'AdditiveChi2Sampler'
- 'Nystroem'
- 'RBFSampler'
- 'SkewedChi2Sampler'
sampler_kwargs : Optional[Dict[str, Any]], default=None
A dictionary of keyword arguments to pass to the
initializer for the specified sampler.
custom_learner_path : Optional[:class:`skll.types.PathOrStr`], default=None
Path to module where a custom classifier is defined.
logger : Optional[logging.Logger], default=None
A logging object. If ``None`` is passed, get logger from ``__name__``.
"""
def __init__(
    self,
    model_type: str,
    probability: bool = False,
    pipeline: bool = False,
    feature_scaling: str = "none",
    model_kwargs: Optional[Dict[str, Any]] = None,
    pos_label: Optional[LabelType] = None,
    min_feature_count: int = 1,
    sampler: Optional[str] = None,
    sampler_kwargs: Optional[Dict[str, Any]] = None,
    custom_learner_path: Optional[PathOrStr] = None,
    logger: Optional[logging.Logger] = None,
) -> None:
    """
    Initialize a learner object with the specified settings.

    Parameters
    ----------
    model_type : str
        Name of the estimator class to wrap. If the name is not already
        available in this module's namespace, the class is loaded from
        ``custom_learner_path`` and registered in the module-level
        default parameter grids (and, if it says so, in the list of
        estimators that require dense input).
    probability : bool, default=False
        Whether the learner should return probabilities of all labels.
        Assigned through the ``probability`` property setter.
    pipeline : bool, default=False
        Whether the learner should store a pipeline attribute.
    feature_scaling : str, default="none"
        One of ``"with_std"``, ``"with_mean"``, ``"both"``, or ``"none"``.
        ``"with_mean"`` and ``"both"`` force the use of dense features.
    model_kwargs : Optional[Dict[str, Any]], default=None
        Keyword arguments for the estimator's initializer; they override
        the defaults chosen here. For AdaBoost, Bagging, and RANSAC
        models, an ``"estimator"`` entry naming a base estimator is
        replaced by an instance of that estimator. The dictionary passed
        in by the caller is never modified.
    pos_label : Optional[:class:`skll.types.LabelType`], default=None
        Label of the positive class for binary classification. It is
        converted with ``safe_float`` when it is not ``None``.
    min_feature_count : int, default=1
        Minimum count passed on to the ``SelectByMinCount`` selector.
    sampler : Optional[str], default=None
        Name of the kernel approximation sampler to instantiate, if any.
    sampler_kwargs : Optional[Dict[str, Any]], default=None
        Keyword arguments for the sampler's initializer.
    custom_learner_path : Optional[:class:`skll.types.PathOrStr`], default=None
        Path to the module that defines a custom learner.
    logger : Optional[logging.Logger], default=None
        A logging object. If ``None`` is passed, get logger from ``__name__``.
    """
    # the registry of dense-only estimators is rebound below when a custom
    # learner declares that it requires dense input
    global _REQUIRES_DENSE

    super(Learner, self).__init__()
    self.feat_vectorizer: Optional[Union[DictVectorizer, FeatureHasher]] = None
    self.scaler: Optional[StandardScaler] = None
    self.label_dict: Dict[LabelType, int] = {}
    self.label_list: List[LabelType] = []
    self.pos_label = safe_float(pos_label) if pos_label is not None else pos_label
    self._model = None
    self._store_pipeline = pipeline
    self._feature_scaling = feature_scaling
    self._min_feature_count = min_feature_count
    self.feat_selector: SelectByMinCount = SelectByMinCount(min_count=self._min_feature_count)
    self._model_kwargs: Dict[str, Any] = {}
    self._sampler_kwargs: Dict[str, Any] = {}
    self.logger = logger if logger else logging.getLogger(__name__)

    if model_type not in globals():
        # here, we need to import the custom model and add it
        # to the appropriate lists of models
        globals()[model_type] = load_custom_learner(custom_learner_path, model_type)
        model_class = globals()[model_type]
        default_param_grid = (
            model_class.default_param_grid()
            if hasattr(model_class, "default_param_grid")
            else {}
        )
        _DEFAULT_PARAM_GRIDS.update({model_class: default_param_grid})
        if hasattr(model_class, "requires_dense") and model_class.requires_dense():
            _REQUIRES_DENSE = _REQUIRES_DENSE + (model_class,)

    self._model_type = globals()[model_type]

    # Use setter to set self.probability
    self.probability = probability

    # we need to use dense features under certain conditions:
    # - if we are using any of the estimators that are _known_
    #   to accept only dense features
    # - if we are doing centering as part of feature scaling
    # - if we are using non-negative least squares regression
    self._use_dense_features = (
        issubclass(self._model_type, _REQUIRES_DENSE)
        or self._feature_scaling in {"with_mean", "both"}
        or (
            issubclass(self._model_type, LinearRegression)
            and model_kwargs is not None
            and model_kwargs.get("positive", False)
        )
    )

    # Set default keyword arguments for models that we have some for.
    if issubclass(self._model_type, SVC):
        self._model_kwargs["cache_size"] = 1000
        self._model_kwargs["probability"] = self.probability
        self._model_kwargs["gamma"] = "scale"
        if self.probability:
            self.logger.warning(
                "Because LibSVM does an internal cross-validation to "
                "produce probabilities, results will not be exactly "
                "replicable when using SVC and probability mode."
            )
    elif issubclass(self._model_type, AdaBoostClassifier):
        self._model_kwargs["algorithm"] = "SAMME"
        self._model_kwargs["n_estimators"] = 500
    elif issubclass(
        self._model_type,
        (
            AdaBoostRegressor,
            BaggingClassifier,
            BaggingRegressor,
            GradientBoostingClassifier,
            GradientBoostingRegressor,
            RandomForestClassifier,
            RandomForestRegressor,
        ),
    ):
        self._model_kwargs["n_estimators"] = 500
    elif issubclass(self._model_type, DummyClassifier):
        self._model_kwargs["strategy"] = "prior"
    elif issubclass(self._model_type, (LinearSVC, LinearSVR)):
        self._model_kwargs["dual"] = "auto"
    elif issubclass(self._model_type, SVR):
        self._model_kwargs["cache_size"] = 1000
        self._model_kwargs["gamma"] = "scale"
    elif issubclass(self._model_type, SGDClassifier):
        self._model_kwargs["loss"] = "log_loss"
        self._model_kwargs["max_iter"] = 1000
        self._model_kwargs["tol"] = 1e-3
    elif issubclass(self._model_type, SGDRegressor):
        self._model_kwargs["max_iter"] = 1000
        self._model_kwargs["tol"] = 1e-3
    elif issubclass(self._model_type, RANSACRegressor):
        self._model_kwargs["loss"] = "squared_error"
    elif issubclass(self._model_type, (MLPClassifier, MLPRegressor)):
        self._model_kwargs["learning_rate"] = "invscaling"
        self._model_kwargs["max_iter"] = 500
    elif issubclass(self._model_type, LogisticRegression):
        self._model_kwargs["max_iter"] = 1000
        self._model_kwargs["solver"] = "liblinear"
        self._model_kwargs["multi_class"] = "auto"

    # fix the random seed for every estimator listed here so that
    # results are replicable
    if issubclass(
        self._model_type,
        (
            AdaBoostClassifier,
            AdaBoostRegressor,
            BaggingClassifier,
            BaggingRegressor,
            DecisionTreeClassifier,
            DecisionTreeRegressor,
            DummyClassifier,
            ElasticNet,
            GradientBoostingClassifier,
            GradientBoostingRegressor,
            HistGradientBoostingClassifier,
            HistGradientBoostingRegressor,
            Lasso,
            LinearSVC,
            LinearSVR,
            LogisticRegression,
            MLPClassifier,
            MLPRegressor,
            RandomForestClassifier,
            RandomForestRegressor,
            RANSACRegressor,
            Ridge,
            RidgeClassifier,
            SGDClassifier,
            SGDRegressor,
            SVC,
            TheilSenRegressor,
        ),
    ):
        self._model_kwargs["random_state"] = 123456789

    if sampler_kwargs:
        self._sampler_kwargs.update(sampler_kwargs)
    if sampler:
        sampler_type = globals()[sampler]
        if issubclass(sampler_type, (Nystroem, RBFSampler, SkewedChi2Sampler)):
            self._sampler_kwargs["random_state"] = 123456789
        self.sampler = sampler_type(**self._sampler_kwargs)
    else:
        self.sampler = None

    if model_kwargs:
        # Work on a shallow copy: the `estimator` entry may be replaced
        # below, and writing that back into the caller's dictionary would
        # break any later reuse of it (the entry would then hold an
        # estimator instance instead of the estimator name).
        model_kwargs = dict(model_kwargs)

        # if the model is an AdaBoostClassifier, AdaBoostRegressor,
        # BaggingClassifier, BaggingRegressor, or RANSACRegressor,
        # then we need to convert the specified `estimator` string
        # into an object before passing it in to the learner constructor.
        # We also need to make sure where appropriate, we set the random
        # state to a fixed seed such that results are replicable
        accepts_base_estimator = issubclass(
            self._model_type,
            (
                AdaBoostClassifier,
                AdaBoostRegressor,
                BaggingClassifier,
                BaggingRegressor,
                RANSACRegressor,
            ),
        )
        if accepts_base_estimator and "estimator" in model_kwargs:
            base_estimator_kwargs: Dict[str, Any]

            # check if a base estimator name was specified
            base_estimator_name = model_kwargs.get("estimator")

            # set some fixed parameters for specific base estimators
            if base_estimator_name in ["LinearRegression", "MultinomialNB"]:
                base_estimator_kwargs = {}
            elif base_estimator_name in ["SGDClassifier", "SGDRegressor"]:
                base_estimator_kwargs = {
                    "max_iter": 1000,
                    "tol": 0.001,
                    "random_state": 123456789,
                }
            elif base_estimator_name == "SVR":
                base_estimator_kwargs = {"gamma": "scale"}
            elif base_estimator_name == "SVC":
                base_estimator_kwargs = {"gamma": "scale", "random_state": 123456789}
            else:
                base_estimator_kwargs = {"random_state": 123456789}

            # instantiate a base estimator if one was specified and add it
            # to the main learner's model keyword arguments
            if base_estimator_name:
                base_estimator = globals()[base_estimator_name](**base_estimator_kwargs)
                model_kwargs["estimator"] = base_estimator

        self._model_kwargs.update(model_kwargs)
[docs]
@classmethod
def from_file(
    cls, learner_path: PathOrStr, logger: Optional[logging.Logger] = None
) -> "Learner":
    """
    Load a saved ``Learner`` instance from a file path.

    Parameters
    ----------
    learner_path : :class:`skll.types.PathOrStr`
        The path to a saved ``Learner`` instance file.
    logger : Optional[logging.Logger], default=None
        A logging object. If ``None`` is passed, get logger from ``__name__``.

    Returns
    -------
    :class:`skll.learner.Learner`
        The ``Learner`` instance loaded from the file.
    """
    # fall back to a module-level logger when the caller did not supply one
    if not logger:
        logger = logging.getLogger(__name__)

    # hand the actual work over to the learner loading utility function
    loaded_learner = _load_learner_from_disk(cls, learner_path, logger)

    # the utility must give back an instance of the class we were called on
    assert isinstance(loaded_learner, cls)
    return loaded_learner
@property
def model_type(self):
    """
    Return the model type (i.e., the class).

    This is the estimator class stored by the initializer, not an
    instance of it.
    """
    return self._model_type
@property
def model_kwargs(self) -> Dict[str, Any]:
    """
    Return a dictionary of the underlying scikit-learn model's keyword arguments.

    The dictionary itself is returned, not a copy of it.
    """
    return self._model_kwargs
@property
def model(self):
    """
    Return the underlying scikit-learn model.

    The initializer sets this to ``None``.
    """
    return self._model
[docs]
def load(self, learner_path: PathOrStr) -> None:
    """
    Replace the current learner instance with a saved learner.

    The saved learner is read first and the state of this instance is
    only replaced once that has succeeded, so a failed load leaves the
    current learner untouched.

    Parameters
    ----------
    learner_path : :class:`skll.types.PathOrStr`
        The path to a saved learner object file to load.
    """
    # load before touching our own attributes; if this raises, `self`
    # still holds its previous state
    loaded_learner = Learner.from_file(learner_path)

    # assigning `__dict__` swaps out every attribute at once, so the old
    # state does not need to be deleted beforehand
    self.__dict__ = loaded_learner.__dict__
def _convert_coef_array_to_feature_names(self, coef: np.ndarray, feature_name_prefix: str = ""):
    """
    Convert model coefficients array to dictionary.

    Method used by `model_params` to convert the model
    coefficients array into a dictionary with feature names as
    keys and the coefficients as values. Coefficients that are
    zero are left out of the result.

    Parameters
    ----------
    coef : numpy.ndarray
        A numpy array with the model coefficients
    feature_name_prefix : str
        An optional string that should be prefixed to the feature
        name, e.g. the name of the class for LogisticRegression
        or the class pair for SVCs with linear kernels.

    Returns
    -------
    Dict[str, Any]
        A dictionary of labeled weights. It is empty if the feature
        vectorizer is neither a ``FeatureHasher`` nor a
        ``DictVectorizer``.
    """
    vocabulary = {}

    # if we are doing feature hashing, then we need to make up
    # the feature names
    if isinstance(self.feat_vectorizer, FeatureHasher):
        num_features = len(coef)

        # zero-pad the 1-based indices to the width of the largest one;
        # counting digits (instead of using log10) also works for an
        # empty coefficient array
        index_width_in_feature_name = len(str(num_features))
        vocabulary = {
            f"hashed_feature_{str(idx + 1).zfill(index_width_in_feature_name)}": idx
            for idx in range(num_features)
        }

    # otherwise we can just use the DictVectorizer vocabulary
    # to get the feature names
    elif isinstance(self.feat_vectorizer, DictVectorizer):
        vocabulary = self.feat_vectorizer.vocabulary_

    # create the final result dictionary with the prefixed
    # feature names and the corresponding non-zero coefficients
    return {
        f"{feature_name_prefix}{feat}": coef[idx]
        for feat, idx in vocabulary.items()
        if coef[idx]
    }
@property
def model_params(self) -> Tuple[Dict[str, Any], Dict[str, Any]]:
    """
    Return model parameters (i.e., weights).

    Return the weights for a ``LinearModel`` (e.g., ``Ridge``),
    regression, and liblinear models, as well as for SVCs and SVRs
    with linear kernels. If the model was trained using feature
    hashing, then names of the form `hashed_feature_XX` are used instead.

    Returns
    -------
    res : Dict[str, Any]
        A dictionary of labeled weights.
    intercept : Dict[str, Any]
        A dictionary of intercept(s).

    Raises
    ------
    ValueError
        If the instance does not support model parameters.
    """
    res = {}
    intercept = {}
    if (
        isinstance(self._model, LinearModel)
        or (isinstance(self._model, SVR) and self._model.kernel == "linear")
        or isinstance(self._model, SGDRegressor)
    ):
        # also includes RescaledRidge, RescaledSVR, RescaledSGDRegressor
        coef = self.model.coef_
        intercept = {"_intercept_": self.model.intercept_}

        # convert SVR coefficient from a matrix to a 1D array
        # and convert from sparse to dense also if necessary.
        # However, this last bit may not be necessary
        # if we did feature scaling and coef is already dense.
        if isinstance(self._model, SVR):
            if sp.issparse(coef):
                coef = coef.toarray()
            coef = coef[0]

        # inverse transform to get indices for before feature selection
        coef = coef.reshape(1, -1)
        coef = self.feat_selector.inverse_transform(coef)[0]
        res = self._convert_coef_array_to_feature_names(coef)

    elif isinstance(self._model, (LinearSVC, LogisticRegression)):
        label_list = self.label_list

        # if there are only two labels, scikit-learn will only have one
        # set of parameters and they will be associated with label 1 (not
        # 0)
        if len(self.label_list) == 2:
            label_list = self.label_list[-1:]

        if isinstance(self.feat_vectorizer, FeatureHasher):
            self.logger.warning(
                "No feature names are available since "
                "this model was trained on hashed "
                "features."
            )

        for i, label in enumerate(label_list):
            coef = self.model.coef_[i]
            coef = coef.reshape(1, -1)
            coef = self.feat_selector.inverse_transform(coef)[0]
            label_res = self._convert_coef_array_to_feature_names(
                coef, feature_name_prefix=f"{label}\t"
            )
            res.update(label_res)

        if isinstance(self.model.intercept_, float):
            intercept = {"_intercept_": self.model.intercept_}
        elif self.model.intercept_.any():
            intercept = dict(zip(label_list, self.model.intercept_))  # type: ignore

    # for SVCs with linear kernels, we want to print out the primal
    # weights - that is, the weights for each feature for each one-vs-one
    # binary classifier. These are the weights contained in the `coef_`
    # attribute of the underlying scikit-learn model. This is a matrix that
    # has the shape [(n_classes)*(n_classes -1)/2, n_features] since there
    # are C(n_classes, 2) = n_classes*(n_classes-1)/2 one-vs-one classifiers
    # and each one has weights for each of the features. According to the
    # scikit-learn user guide and the code for the function `_one_vs_one_coef()`
    # in `svm/base.py`, the order of the rows is as follows: "0 vs 1",
    # "0 vs 2", ... "0 vs n", "1 vs 2", "1 vs 3", "1 vs n", ... "n-1 vs n".
    elif isinstance(self._model, SVC) and self._model.kernel == "linear":
        intercept = {}
        if isinstance(self.feat_vectorizer, FeatureHasher):
            self.logger.warning(
                "No feature names are available since "
                "this model was trained on hashed "
                "features."
            )

        # pairs of labels come out in the same order as the rows of `coef_`
        for i, (class1, class2) in enumerate(combinations(self.label_list, 2)):
            coef = self.model.coef_[i]

            # `coef_` is only sparse when the model was trained on sparse
            # features; a dense row just needs to be made two-dimensional
            # for the feature selector, as in the other branches
            if sp.issparse(coef):
                coef = coef.toarray()
            else:
                coef = np.asarray(coef).reshape(1, -1)

            coef = self.feat_selector.inverse_transform(coef)[0]
            class_pair_res = self._convert_coef_array_to_feature_names(
                coef, feature_name_prefix=f"{class1}-vs-{class2}\t"
            )
            res.update(class_pair_res)
            intercept[f"{class1}-vs-{class2}"] = self.model.intercept_[i]

    else:
        # not supported
        raise ValueError(
            f"{self._model_type.__name__} is not supported "
            "by model_params with its current settings."
        )

    return res, intercept
@property
def probability(self) -> bool:
    """
    Return the value of the probability flag.

    The flag indicates whether the learner should return probabilities
    of all labels (instead of just the label with the highest probability).
    """
    return self._probability


@probability.setter
def probability(self, value: bool) -> None:
    """
    Set the probability flag.

    If the flag is switched on for a model type without a
    ``predict_proba`` method, a warning is logged and the flag
    is reset to ``False``.

    Parameters
    ----------
    value : bool
        Whether learner should return probabilities of all labels.
    """
    self._probability = value

    # some model types (e.g., LinearSVC) do not support predict_proba
    lacks_predict_proba = not hasattr(self.model_type, "predict_proba")
    if lacks_predict_proba and value:
        self.logger.warning(
            "Probability was set to True, but "
            f"{self.model_type.__name__} does not have a "
            "predict_proba() method."
        )
        self._probability = False
def __getstate__(self) -> Dict[str, Any]:
    """
    Return attributes that should be pickled.

    Every instance attribute is included except ``logger``,
    because loggers cannot be pickled.
    """
    return {name: value for name, value in self.__dict__.items() if name != "logger"}
[docs]
def save(self, learner_path: PathOrStr) -> None:
    """
    Save the ``Learner`` instance to a file.

    The work is delegated to the learner saving utility function.

    Parameters
    ----------
    learner_path : :class:`skll.types.PathOrStr`
        The path to save the ``Learner`` instance to.
    """
    _save_learner_to_disk(self, learner_path)
def _create_estimator(self):
    """
    Create an estimator.

    Returns
    -------
    estimator
        A new instance of the model type, built with the stored
        model keyword arguments.
    default_param_grid : Dict[str, Any]
        The parameter grid for the estimator.

    Raises
    ------
    ValueError
        If there is no default parameter grid for estimator.
    """
    matched_grid = None

    # go through every registered class; if the model type is a subclass
    # of more than one of them, the grid of the last match is the one kept
    for registered_class, registered_grid in _DEFAULT_PARAM_GRIDS.items():
        if issubclass(self._model_type, registered_class):
            matched_grid = registered_grid

    if matched_grid is None:
        raise ValueError(f"{self._model_type.__name__} is not a valid learner type.")

    return self._model_type(**self._model_kwargs), matched_grid
[docs]
def get_feature_names_out(self) -> np.ndarray:
    """
    Return the names of the actual features used by the estimator.

    Some features may have been filtered out by the feature
    selector, in which case the vectorizer alone is no longer the
    correct source for the feature names. This method applies the
    selector's support mask to the vectorizer's feature names and
    returns only the names of the features that were selected.

    Returns
    -------
    names : numpy.ndarray of shape (num_features,)
        Names of features actually used by the estimator.

    Raises
    ------
    ValueError
        If ``self.feat_vectorizer`` is either ``None`` or a
        :class:`sklearn.feature_extraction.FeatureHasher`.
    """
    # only a DictVectorizer can give us feature names
    if not isinstance(self.feat_vectorizer, DictVectorizer):
        raise ValueError(
            "Cannot get feature names: `feat_vectorizer` is not defined or a `FeatureHasher`."
        )

    all_names = self.feat_vectorizer.get_feature_names_out()
    selected_mask = self.feat_selector.get_support()
    return all_names[selected_mask]
def _check_input_formatting(self, examples: FeatureSet) -> None:
    """
    Check that the examples are properly formatted.

    Parameters
    ----------
    examples : :class:`skll.data.featureset.FeatureSet`
        The ``FeatureSet`` instance to use for training.

    Raises
    ------
    TypeError
        If the model is a regressor and any label is a string.
    TypeError
        If any features are strings.
    """
    # labels for a regression task must not be strings
    doing_regression = self.model_type._estimator_type == "regressor"
    if doing_regression and examples.labels is not None:
        if any(isinstance(label, str) for label in examples.labels):
            raise TypeError(
                "You are doing regression with string labels. Convert them to integers or floats."
            )

    # nothing else to check if there are no features
    if examples.features is None:
        return

    # feature values must not be strings either; to check this we look at
    # the stored values of a sparse array (more likely) or at a flat
    # iterator over a dense one
    if sp.issparse(examples.features):
        feature_values = examples.features.data
    else:
        feature_values = examples.features.flat

    if any(isinstance(value, str) for value in feature_values):
        raise TypeError("You have feature values that are strings. Convert them to floats.")
def _check_max_feature_value(self, feat_array: np.ndarray):
    """
    Check if the maximum absolute value of any feature is too large.

    A warning is logged if the largest absolute value is greater
    than 1000. Nothing is checked (and nothing is logged) if the
    array holds no values at all, e.g., a sparse array without any
    stored entries.

    Parameters
    ----------
    feat_array : numpy.ndarray
        A numpy array with features.
    """
    # `.data` gives the stored values of a sparse array and the
    # underlying buffer of a dense one
    abs_values = np.abs(np.asarray(feat_array.data))

    # the maximum of an empty array is undefined and would raise
    if abs_values.size == 0:
        return

    max_feat_abs = np.max(abs_values)
    if max_feat_abs > 1000.0:
        self.logger.warning(
            "You have a feature with a very large "
            f"absolute value ({max_feat_abs}). That may "
            "cause the learning algorithm to crash or "
            "perform poorly."
        )
def _create_label_dict(self, examples: FeatureSet) -> None:
    """
    Create a dictionary of labels for classification problems.

    The dictionary maps each label to its position in the list of
    unique labels. Nothing is done if the dictionary has already
    been created or if the model is a regressor.

    Parameters
    ----------
    examples : :class:`skll.data.featureset.FeatureSet`
        The examples to use for training.
    """
    # nothing to do if we already have a mapping or for regression models
    if len(self.label_dict) > 0 or self.model_type._estimator_type == "regressor":
        return

    # collect the unique labels; note that np.unique() returns them sorted
    training_labels = examples.labels
    if training_labels is not None:
        self.label_list = np.unique(training_labels).tolist()

    # a positive class only makes sense for binary classification: there,
    # move it to the end of the label list; otherwise warn and forget it
    if self.pos_label is not None:
        if len(self.label_list) == 2:

            def positive_class_last(label):
                return (label == self.pos_label, label)

            self.label_list = sorted(self.label_list, key=positive_class_last)
        else:
            self.logger.warning("Ignoring value of `pos_label` for multi-class classification.")
            self.pos_label = None

    # map every label to its index in the (possibly re-sorted) label list
    self.label_dict = {label: index for index, label in enumerate(self.label_list)}
def _train_setup(self, examples: FeatureSet) -> None:
    """
    Set up the feature vectorizer and the scaler.

    Parameters
    ----------
    examples : :class:`skll.data.featureset.FeatureSet`
        The ``FeatureSet`` instance to use for training.
    """
    # validate the feature values and the labels first
    self._check_input_formatting(examples)

    # the vectorizer holds the feature name -> value mapping
    self.feat_vectorizer = examples.vectorizer

    # feature scaling is not supported for `MultinomialNB` learners, so
    # they are treated as if no scaling had been asked for
    if issubclass(self._model_type, MultinomialNB) or self._feature_scaling == "none":
        # a scaler that neither centers nor scales leaves the
        # feature values unmodified
        self.scaler = StandardScaler(copy=False, with_mean=False, with_std=False)
    else:
        center_features = self._feature_scaling in {"with_mean", "both"}
        scale_features = self._feature_scaling in {"with_std", "both"}
        self.scaler = StandardScaler(
            copy=True, with_mean=center_features, with_std=scale_features
        )
[docs]
def train(
self,
examples: FeatureSet,
param_grid: Optional[Dict[str, Any]] = None,
grid_search_folds: Union[int, FoldMapping] = 5,
grid_search: bool = True,
grid_objective: Optional[str] = None,
grid_jobs: Optional[int] = None,
shuffle: bool = False,
) -> Tuple[float, Dict[str, Any]]:
"""
Train model underlying the learner.
Return the grid search score and a dictionary of grid search results.
Parameters
----------
examples : :class:`skll.data.featureset.FeatureSet`
The ``FeatureSet`` instance to use for training.
param_grid : Optional[Dict[str, Any]], default=None
The parameter grid to search through for grid
search. If ``None``, a default parameter grid
will be used.
grid_search_folds : Union[int, :class:`skll.types.FoldMapping`], default=5
The number of folds to use when doing the
grid search, or a mapping from example IDs to folds.
grid_search : bool, default=True
Should we do grid search?
grid_objective : Optional[str], default=None
The name of the objective function to use when
doing the grid search. Must be specified if
``grid_search`` is ``True``.
grid_jobs : Optional[int], default=None
The number of jobs to run in parallel when doing the
grid search. If ``None`` or 0, the number of
grid search folds will be used.
shuffle : bool, default=False
Shuffle examples (e.g., for grid search CV.)
Returns
-------
float
The best grid search objective function score, or 0 if
we're not doing grid search
Dict[str, Any]
Dictionary of grid search CV results with keys such as "params",
"mean_test_score", etc, that are mapped to lists of values
associated with each hyperparameter set combination, or
None if not doing grid search.
Raises
------
ValueError
If grid_objective is not a valid grid objective or if
one is not specified when necessary.
MemoryError
If process runs out of memory converting training data to dense.
ValueError
If FeatureHasher is used with MultinomialNB.
"""
# get the estimator type since we need it in multiple places below
estimator_type = self.model_type._estimator_type
# if we are asked to do grid search, check that the grid objective
# is specified and that the specified function is valid for the
# selected learner
if grid_search:
if not grid_objective:
raise ValueError(
"Grid search is on by default. You must "
"either specify a grid objective or turn off"
" grid search."
)
# get the list of objectives that are acceptable in the current
# prediction scenario and raise an exception if the current
# objective is not in this allowed list
if examples.labels is not None:
label_type = examples.labels.dtype.type
if estimator_type == "classifier":
sorted_unique_labels = np.unique(examples.labels)
allowed_objectives = get_acceptable_classification_metrics(sorted_unique_labels)
else:
allowed_objectives = get_acceptable_regression_metrics()
if grid_objective not in allowed_objectives:
raise ValueError(
f"'{grid_objective}' is not a valid objective"
f" function for {self._model_type.__name__} "
"with labels of type "
f"{label_type.__name__}."
)
# If we're using a correlation metric for doing binary
# classification and probability is set to true, we assume
# that the user actually wants the `_with_probabilities`
# version of the metric
if (
grid_objective in CORRELATION_METRICS
and estimator_type == "classifier"
and self.probability
):
self.logger.info(
f'You specified "{grid_objective}" as the '
'objective with "probability" set to "true".'
" If this is a binary classification task "
"with integer labels, the probabilities for "
"the positive class will be used to compute "
"the correlation."
)
old_grid_objective = grid_objective
new_grid_objective = f"{grid_objective}_probs"
metrics_module = import_module("skll.metrics")
metric_func = getattr(metrics_module, "correlation")
_CUSTOM_METRICS[new_grid_objective] = make_scorer(
metric_func, corr_type=grid_objective, response_method="predict_proba"
)
grid_objective = new_grid_objective
# Shuffle so that the folds are random for the inner grid search CV.
# If grid search is True but shuffle isn't, shuffle anyway.
# You can't shuffle a scipy sparse matrix in place, so unfortunately
# we make a copy of everything (and then get rid of the old version)
if grid_search or shuffle:
if grid_search and not shuffle:
self.logger.warning(
"Training data will be shuffled to randomize "
"grid search folds. Shuffling may yield "
"different results compared to scikit-learn."
)
ids, labels, features = sk_shuffle(
examples.ids, examples.labels, examples.features, random_state=123456789
)
examples = FeatureSet(
examples.name, ids, labels=labels, features=features, vectorizer=examples.vectorizer
)
# call train setup to set up the vectorizer, the labeldict, and the
# scaler
self._create_label_dict(examples)
self._train_setup(examples)
# select features
xtrain = self.feat_selector.fit_transform(examples.features)
# Convert to dense if necessary
if self._use_dense_features:
try:
xtrain = xtrain.toarray()
except MemoryError:
if issubclass(self._model_type, _REQUIRES_DENSE):
reason = f"{self._model_type.__name__} does not support " "sparse matrices."
else:
reason = f"{self._feature_scaling} feature scaling " "requires a dense matrix."
raise MemoryError(
"Ran out of memory when converting training"
" data to dense. This was required because "
f"{reason}"
)
if isinstance(self.feat_vectorizer, FeatureHasher) and issubclass(
self._model_type, MultinomialNB
):
raise ValueError(
"Cannot use FeatureHasher with MultinomialNB "
"because MultinomialNB cannot handle negative "
"feature values."
)
# Scale features if necessary
if self.scaler:
xtrain = self.scaler.fit_transform(xtrain)
# check whether any feature values are too large
self._check_max_feature_value(xtrain)
# Sampler
if self.sampler is not None and issubclass(self._model_type, MultinomialNB):
raise ValueError(
"Cannot use a sampler with MultinomialNB "
"because MultinomialNB cannot handle negative "
"feature values."
)
if self.sampler:
self.logger.warning("Sampler converts sparse matrix to dense")
if isinstance(self.sampler, SkewedChi2Sampler):
self.logger.warning("SkewedChi2Sampler uses a dense matrix")
if sp.issparse(xtrain):
xtrain = xtrain.toarray()
xtrain = self.sampler.fit_transform(xtrain)
# use label dict transformed version of examples.labels if doing
# classification
if examples.labels is not None:
if estimator_type == "classifier":
labels = np.array([self.label_dict[label] for label in examples.labels])
else:
labels = examples.labels
# Instantiate an estimator and get the default parameter grid to search
estimator, default_param_grid = self._create_estimator()
# Use default parameter grid if we weren't passed one
# In case the default parameter grid is also empty
# then there's no point doing the grid search at all
if grid_search and not param_grid:
if default_param_grid == {}:
self.logger.warning(
"SKLL has no default parameter grid "
"available for the "
f"{self._model_type.__name__} learner and"
" no parameter grids were supplied. Using"
" default values instead of grid search."
)
grid_search = False
else:
param_grid = default_param_grid
# set up a grid searcher if we are asked to
if grid_search:
# explicitly declare the variable types
folds: Union[int, IndexIterator]
final_grid_jobs: int
# set up grid search folds
if isinstance(grid_search_folds, int):
grid_search_folds = compute_num_folds_from_example_counts(
grid_search_folds, labels, self.model_type._estimator_type, logger=self.logger
)
if not grid_jobs:
final_grid_jobs = grid_search_folds
else:
final_grid_jobs = min(grid_search_folds, grid_jobs)
folds = grid_search_folds
elif examples.labels is not None:
# use the number of unique fold IDs as the number of grid jobs
num_specified_folds = len(set(grid_search_folds.values()))
if not grid_jobs:
final_grid_jobs = num_specified_folds
else:
final_grid_jobs = min(num_specified_folds, grid_jobs)
# Only retain IDs within folds if they're in grid_search_folds
dummy_label = next(iter(grid_search_folds.values()))
fold_groups = [
grid_search_folds.get(curr_id, dummy_label) for curr_id in examples.ids
]
kfold = FilteredLeaveOneGroupOut(
grid_search_folds, examples.ids, logger=self.logger
)
folds = kfold.split(examples.features, examples.labels, fold_groups)
# limit the number of grid_jobs to be no higher than five or the
# number of cores for the machine, whichever is lower
final_grid_jobs = min(final_grid_jobs, cpu_count(), MAX_CONCURRENT_PROCESSES)
# look up the scorer function in SKLL's custom metrics if the metric
# is not provided by scikit-learn itself
assert grid_objective is not None
final_grid_objective = (
grid_objective
if grid_objective in get_scorer_names()
else _CUSTOM_METRICS[grid_objective]
)
# we set `error_score` to "raise" since we want scikit-learn to explicitly
# raise an exception if the estimator fails to fit for any reason
grid_searcher = GridSearchCV(
estimator,
param_grid,
scoring=final_grid_objective,
cv=folds,
n_jobs=final_grid_jobs,
error_score="raise",
pre_dispatch=final_grid_jobs,
)
# run the grid search for hyperparameters
grid_searcher.fit(xtrain, labels)
self._model = grid_searcher.best_estimator_
grid_score = grid_searcher.best_score_
grid_cv_results = grid_searcher.cv_results_
else:
self._model = estimator.fit(xtrain, labels)
grid_score = 0.0
grid_cv_results = None
# restore the original of the grid objective if we
# had futzed with it to handle correlation
# objectives and probability outputs
if "old_grid_objective" in locals():
grid_objective = old_grid_objective
del _CUSTOM_METRICS[new_grid_objective]
# store a scikit-learn Pipeline in the `pipeline` attribute
# composed of a copy of the vectorizer, the selector,
# the sampler, the scaler, and the estimator. This pipeline
# attribute can then be used by someone who wants to take a SKLL
# model and then do further analysis using scikit-learn
# We are using copies since the user might want to play
# around with the pipeline and we want to let her do that
# but keep the SKLL model the same
if self._store_pipeline:
# initialize the list that will hold the pipeline steps
pipeline_steps: List[Tuple[str, Any]] = []
# start with the vectorizer
# note that sometimes we may have to end up using dense
# features or if we were using a SkewedChi2Sampler which
# requires dense inputs. If this turns out to be the case
# then let's turn off `sparse` for the vectorizer copy
# to be stored in the pipeline as well so that it works
# on the scikit-learn in the same way. However, note that
# this solution will only work for DictVectorizers. For
# feature hashers, we manually convert things to dense
# when we need in SKLL. Therefore, to handle this case,
# we basically need to create a custom intermediate
# pipeline stage that will convert the features to dense
# once the hashing is done since this is what happens
# in SKLL.
vectorizer_copy = copy.deepcopy(self.feat_vectorizer)
if self._use_dense_features or isinstance(self.sampler, SkewedChi2Sampler):
if isinstance(vectorizer_copy, DictVectorizer):
self.logger.warning(
"The `sparse` attribute of the DictVectorizer stage "
"will be set to `False` in the pipeline since dense "
"features are required when centering."
)
vectorizer_copy.sparse = False
else:
self.logger.warning(
"A custom pipeline stage (`Densifier`) will be "
"inserted in the pipeline since the current SKLL "
"configuration requires dense features."
)
densifier = Densifier()
pipeline_steps.append(("densifier", densifier))
pipeline_steps.insert(0, ("vectorizer", vectorizer_copy))
# next add the selector
pipeline_steps.append(("selector", copy.deepcopy(self.feat_selector)))
# next, include the scaler
pipeline_steps.append(("scaler", copy.deepcopy(self.scaler)))
# next, include the sampler, if there is one
if self.sampler:
pipeline_steps.append(("sampler", copy.deepcopy(self.sampler)))
# finish with the estimator
pipeline_steps.append(("estimator", copy.deepcopy(self.model)))
self.pipeline = Pipeline(steps=pipeline_steps)
return grid_score, grid_cv_results
[docs]
def evaluate(
    self,
    examples: FeatureSet,
    prediction_prefix: Optional[str] = None,
    append: bool = False,
    grid_objective: Optional[str] = None,
    output_metrics: List[str] = [],
) -> EvaluateTaskResults:
    """
    Evaluate the learner on a given dev or test ``FeatureSet``.

    Parameters
    ----------
    examples : :class:`skll.data.featureset.FeatureSet`
        The ``FeatureSet`` instance to evaluate the performance of the
        model on. It must have labels.
    prediction_prefix : Optional[str], default=None
        If not ``None``, predictions will also be written out to a file with
        the name ``<prediction_prefix>_predictions.tsv``. Note that
        the prefix can also contain a path.
    append : bool, default=False
        Should we append the current predictions to the file if it exists?
    grid_objective : Optional[str], default=None
        The objective function that was used when doing the grid search.
    output_metrics : List[str], default=[]
        List of additional metric names to compute in addition to grid
        objective.

    Returns
    -------
    :class:`skll.types.EvaluateTaskResults`
        A 6-tuple containing the confusion matrix, the overall accuracy,
        the per-label PRFs, the model parameters, the grid search objective
        function score, and the additional evaluation metrics, if any.
        For regressors, the first two elements in the tuple are ``None``.

    Raises
    ------
    ValueError
        If ``examples`` has no labels to evaluate against.
    ValueError
        If any of ``output_metrics`` is not acceptable for this learner
        and the type of labels in ``examples``.
    """
    # Evaluation needs gold labels. Fail before predicting so that we do
    # not write out a predictions file for a run that can never finish;
    # previously this case surfaced later as an ``UnboundLocalError``.
    gold_labels = examples.labels
    if gold_labels is None:
        raise ValueError("Cannot evaluate the learner on a FeatureSet without labels.")

    # are we in a regressor or a classifier
    estimator_type = self.model_type._estimator_type

    # make the prediction on the test data; note that these
    # are either class indices or class probabilities
    yhat = self.predict(
        examples, prediction_prefix=prediction_prefix, append=append, class_labels=False
    )

    # for classifiers, convert class labels to indices for consistency
    # but account for any unseen labels in the test set that may not
    # have occurred in the training data at all; then get acceptable
    # metrics based on the type of labels we have
    if estimator_type == "classifier":
        sorted_unique_labels = np.unique(gold_labels)
        test_label_list = sorted_unique_labels.tolist()
        train_and_test_label_dict = add_unseen_labels(self.label_dict, test_label_list)
        ytest = np.array([train_and_test_label_dict[label] for label in gold_labels])
        acceptable_metrics = get_acceptable_classification_metrics(sorted_unique_labels)
    # for regressors we do not need to do anything special to the labels
    else:
        train_and_test_label_dict = None
        ytest = gold_labels
        acceptable_metrics = get_acceptable_regression_metrics()

    # check that all of the output metrics are acceptable
    unacceptable_metrics = set(output_metrics).difference(acceptable_metrics)
    if unacceptable_metrics:
        label_type = gold_labels.dtype.type
        raise ValueError(
            "The following metrics are not valid "
            f"for this learner ({self._model_type.__name__})"
            " with these labels of type "
            f"{label_type.__name__}: "
            f"{list(unacceptable_metrics)}"
        )

    # get the values of the evaluation metrics
    (
        conf_matrix,
        accuracy,
        result_dict,
        objective_score,
        metric_scores,
    ) = compute_evaluation_metrics(
        output_metrics,
        ytest,
        yhat,
        estimator_type,
        label_dict=train_and_test_label_dict,
        grid_objective=grid_objective,
        probability=self.probability,
        logger=self.logger,
    )

    # add in the model parameters and return
    model_params: Dict[str, Any] = self.model.get_params()
    res = (conf_matrix, accuracy, result_dict, model_params, objective_score, metric_scores)
    return res
[docs]
def predict(
    self,
    examples: FeatureSet,
    prediction_prefix: Optional[str] = None,
    append: bool = False,
    class_labels: bool = True,
) -> np.ndarray:
    """
    Generate predictions for the given examples using the learner model.

    Return, and optionally, write out predictions on a given ``FeatureSet``
    to a file. For regressors, the returned and written-out predictions are
    identical. However, for classifiers:

    - if ``class_labels`` is ``True``, class labels are returned
      as well as written out.
    - if ``class_labels`` is ``False`` and the classifier is probabilistic
      (i.e., ``self.probability`` is ``True``), class probabilities are
      returned as well as written out.
    - if ``class_labels`` is ``False`` and the classifier is non-probabilistic
      (i.e., ``self.probability`` is ``False``), class indices are returned
      and class labels are written out.

    TL;DR: for regressors, just ignore ``class_labels``. For classifiers,
    set it to ``True`` to get class labels and ``False`` to get class
    probabilities.

    Parameters
    ----------
    examples : :class:`skll.data.featureset.FeatureSet`
        The ``FeatureSet`` instance to predict labels for.
    prediction_prefix : Optional[str], default=None
        If not ``None``, predictions will also be written out to a file with
        the name ``<prediction_prefix>_predictions.tsv``. For classifiers,
        the predictions written out are class labels unless the learner is
        probabilistic AND ``class_labels`` is set to ``False``. Note that
        this prefix can also contain a path.
    append : bool, default=False
        Should we append the current predictions to the file if it exists?
    class_labels : bool, default=True
        If ``False``, return either the class probabilities (probabilistic
        classifiers) or the class indices (non-probabilistic ones). If
        ``True``, return the class labels no matter what. Ignored for
        regressors.

    Returns
    -------
    numpy.ndarray
        The predictions returned by the ``Learner`` instance.

    Raises
    ------
    AssertionError
        If invalid predictions are being returned or written out.
    MemoryError
        If process runs out of memory when converting to dense.
    RuntimeError
        If there is a mismatch between the learner vectorizer
        and the test set vectorizer, or if the two vectorizers are
        not a supported DictVectorizer/FeatureHasher combination.
    """
    example_ids = examples.ids

    # Need to do some transformations so the features are in the right
    # columns for the test set. Obviously a bit hacky, but storing things
    # in sparse matrices saves memory over our old list of dicts approach.
    # We also need to think about the various combinations of the model
    # vectorizer and the vectorizer for the set for which we want to make
    # predictions:
    # 1. Both vectorizers are DictVectorizers. If they use different sets
    #    of features, we raise a warning and transform the features of the
    #    prediction set from its space to the trained model space.
    # 2. Both vectorizers are FeatureHashers. If they use different number
    #    of feature bins, we should just raise an error since there's no
    #    inverse_transform() available for a FeatureHasher - the hash
    #    function is not reversible.
    # 3. The model vectorizer is a FeatureHasher but the prediction feature
    #    set vectorizer is a DictVectorizer. We can handle this case by
    #    calling inverse_transform() on the DictVectorizer and then
    #    transform() on the FeatureHasher.
    # 4. The model vectorizer is a DictVectorizer but the prediction feature
    #    set vectorizer is a FeatureHasher. Again, we should raise an error
    #    here since there's no inverse available for the hasher.

    # 1. both are DictVectorizers
    if isinstance(self.feat_vectorizer, DictVectorizer) and isinstance(
        examples.vectorizer, DictVectorizer
    ):
        if set(self.feat_vectorizer.feature_names_) != set(examples.vectorizer.feature_names_):
            self.logger.warning(
                "There is mismatch between the training model features "
                "and the data passed to predict. The prediction features "
                "will be transformed to the trained model space."
            )
        if self.feat_vectorizer == examples.vectorizer:
            xtest = examples.features
        else:
            xtest = self.feat_vectorizer.transform(
                examples.vectorizer.inverse_transform(examples.features)
            )
    # 2. both are FeatureHashers
    elif isinstance(self.feat_vectorizer, FeatureHasher) and isinstance(
        examples.vectorizer, FeatureHasher
    ):
        self_feat_vec_tuple = (
            self.feat_vectorizer.dtype,
            self.feat_vectorizer.input_type,
            self.feat_vectorizer.n_features,
        )
        example_feat_vec_tuple = (
            examples.vectorizer.dtype,
            examples.vectorizer.input_type,
            examples.vectorizer.n_features,
        )
        if self_feat_vec_tuple == example_feat_vec_tuple:
            xtest = examples.features
        else:
            self.logger.error(
                "There is mismatch between the FeatureHasher "
                "configuration for the training data and the "
                "configuration for the data passed to predict"
            )
            raise RuntimeError("Mismatched hasher configurations")
    # 3. model is a FeatureHasher and test set is a DictVectorizer
    elif isinstance(self.feat_vectorizer, FeatureHasher) and isinstance(
        examples.vectorizer, DictVectorizer
    ):
        xtest = self.feat_vectorizer.transform(
            examples.vectorizer.inverse_transform(examples.features)
        )
    # 4. model is a DictVectorizer and test set is a FeatureHasher
    elif isinstance(self.feat_vectorizer, DictVectorizer) and isinstance(
        examples.vectorizer, FeatureHasher
    ):
        self.logger.error(
            "Cannot predict with a model using a "
            "DictVectorizer on data that uses a "
            "FeatureHasher"
        )
        raise RuntimeError("Cannot use FeatureHasher for data")
    # anything else (e.g., a vectorizer that was never set) cannot be
    # handled; say so explicitly instead of failing further down with an
    # unbound ``xtest``
    else:
        self.logger.error(
            "Cannot predict with an unsupported combination of "
            "model and data vectorizers"
        )
        raise RuntimeError(
            "Unsupported vectorizers: "
            f"{type(self.feat_vectorizer).__name__} (model) and "
            f"{type(examples.vectorizer).__name__} (data)"
        )

    # filter features based on those selected from training set
    xtest = self.feat_selector.transform(xtest)

    # Convert to dense if necessary
    if self._use_dense_features and not isinstance(xtest, np.ndarray):
        try:
            xtest = xtest.toarray()
        except MemoryError:
            if issubclass(self._model_type, _REQUIRES_DENSE):
                reason = f"{self._model_type.__name__} does not support " "sparse matrices."
            else:
                reason = f"{self._feature_scaling} feature scaling " "requires a dense matrix."
            raise MemoryError(
                "Ran out of memory when converting test "
                "data to dense. This was required because "
                f"{reason}"
            )

    # Scale xtest if necessary
    if not issubclass(self._model_type, MultinomialNB) and self.scaler:
        xtest = self.scaler.transform(xtest)

    # Sampler
    if self.sampler:
        self.logger.warning("Sampler converts sparse matrix to dense")
        if isinstance(self.sampler, SkewedChi2Sampler):
            self.logger.warning("SkewedChi2Sampler uses a dense matrix")
            if sp.issparse(xtest):
                xtest = xtest.toarray()
        xtest = self.sampler.transform(xtest)

    # get the various prediction from this learner on these features
    prediction_dict = get_predictions(self, xtest)

    # for classifiers ...
    if self.model_type._estimator_type == "classifier":
        # return and write class labels if they were explicitly asked for
        if class_labels:
            to_return = to_write = prediction_dict["labels"]
        else:
            # return and write probabilities
            if self.probability:
                to_return = to_write = prediction_dict["probabilities"]
            # return class indices and write labels
            else:
                to_return = prediction_dict["raw"]
                to_write = prediction_dict["labels"]
    # for regressors, it's really simple
    else:
        to_write = to_return = prediction_dict["raw"]

    # check that our predictions to write and return are not invalid;
    # this should NEVER happen. We raise explicitly rather than relying
    # on ``assert`` statements, which are stripped when running with -O.
    if to_return is None or to_write is None:
        raise AssertionError("invalid predictions generated")

    # write out the predictions if we are asked to
    if prediction_prefix is not None:
        write_predictions(
            example_ids,
            to_write,
            prediction_prefix,
            self.model_type._estimator_type,
            self.label_list,
            append=append,
        )

    return to_return
[docs]
def cross_validate(
    self,
    examples: FeatureSet,
    stratified: bool = True,
    cv_folds: Union[int, FoldMapping] = 10,
    cv_seed: int = 123456789,
    grid_search: bool = True,
    grid_search_folds: Union[int, FoldMapping] = 5,
    grid_jobs: Optional[int] = None,
    grid_objective: Optional[str] = None,
    output_metrics: List[str] = [],
    prediction_prefix: Optional[str] = None,
    param_grid: Optional[Dict[str, Any]] = None,
    shuffle: bool = False,
    save_cv_folds: bool = True,
    save_cv_models: bool = False,
    use_custom_folds_for_grid_search: bool = True,
) -> CrossValidateTaskResults:
    """
    Run cross-validation for this learner over the given examples.

    For every fold, the learner is trained on the training portion
    (optionally with an inner grid search) and then evaluated on the
    held-out portion.

    Parameters
    ----------
    examples : :class:`skll.data.featureset.FeatureSet`
        The ``FeatureSet`` instance to cross-validate learner performance on.
    stratified : bool, default=True
        Whether the folds should be stratified so that labels are evenly
        distributed across them.
    cv_folds : Union[int, :class:`skll.types.FoldMapping`], default=10
        Either the number of cross-validation folds or a mapping from
        example IDs to folds.
    cv_seed : int, default=123456789
        Seed for the random number generator that shuffles the examples.
        It is *only* used if ``grid_search`` or ``shuffle`` is ``True``.
    grid_search : bool, default=True
        Whether to run a grid search when training on each fold.
        Note: this makes cross-validation take *much* longer.
    grid_search_folds : Union[int, :class:`skll.types.FoldMapping`], default=5
        Either the number of folds for the inner grid search or a mapping
        from example IDs to folds.
    grid_jobs : Optional[int], default=None
        Number of parallel jobs for the grid search. If ``None`` or 0,
        the number of grid search folds is used.
    grid_objective : Optional[str], default=None
        Name of the objective function for the grid search. Required
        when ``grid_search`` is ``True``.
    output_metrics : List[str], default = []
        Additional metric names to compute besides the grid objective.
    prediction_prefix : Optional[str], default=None
        If predictions are to be saved, the filename prefix to use;
        ``"_predictions.tsv"`` is appended to it.
    param_grid : Optional[Dict[str, Any]], default=None
        The parameter grid to search.
    shuffle : bool, default=False
        Whether to shuffle the examples before splitting them into folds.
    save_cv_folds : bool, default=True
        Whether to return the fold ID assigned to each example.
    save_cv_models : bool, default=False
        Whether to return a copy of the learner trained on each fold.
    use_custom_folds_for_grid_search : bool, default=True
        If ``cv_folds`` is a custom dictionary but ``grid_search_folds``
        is not (perhaps due to user oversight), should that same
        dictionary also be used for the inner grid search?

    Returns
    -------
    :class:`skll.types.CrossValidateTaskResults`
        A 5-tuple containing the following:

        List[:class:`skll.types.EvaluateTaskResults`]: the confusion matrix,
        overall accuracy, per-label PRFs, model parameters, objective
        function score, and evaluation metrics (if any) for each fold.

        List[float]: the grid search scores for each fold.

        List[Dict[str, Any]]: list of dictionaries of grid search CV
        results, one per fold, with keys such as "params",
        "mean_test_score", etc, that are mapped to lists of values
        associated with each hyperparameter set combination.

        Optional[:class:`skll.types.FoldMapping`]: dictionary containing the
        test-fold number for each id if ``save_cv_folds`` is ``True``,
        otherwise ``None``.

        Optional[List[:class:`skll.learner.Learner`]]: list of learners,
        one for each fold if ``save_cv_models`` is ``True``, otherwise
        ``None``.

    Raises
    ------
    ValueError
        If classification labels are not properly encoded as strings.
    ValueError
        If ``grid_search`` is ``True`` but ``grid_objective`` is ``None``.
    """
    # a seeded generator keeps the shuffling below replicable
    rng = np.random.RandomState(cv_seed)

    # Classifiers need discrete labels: stratified splitting in sklearn
    # does not work with continuous targets. Random folds would technically
    # work, but labels for classification should be encoded as strings
    # anyway, so we refuse in all cases.
    if self.model_type._estimator_type == "classifier" and type_of_target(
        examples.labels
    ) not in ["binary", "multiclass"]:
        raise ValueError(
            "Floating point labels must be encoded as strings for cross-validation."
        )

    # `train()` would complain about a missing objective as well, but only
    # after a lot of work has already been done, so check right away
    if grid_search and not grid_objective:
        raise ValueError(
            "Grid search is on by default. You must "
            "either specify a grid objective or turn off"
            " grid search."
        )

    # Grid search needs randomized folds, so shuffle whenever either flag
    # is set. A scipy sparse matrix cannot be shuffled in place, hence a
    # shuffled copy replaces the original feature set.
    if grid_search or shuffle:
        if not shuffle:
            # we only get here because grid search forced the shuffle
            self.logger.warning(
                "Training data will be shuffled to "
                "randomize grid search folds. Shuffling "
                "may yield different results compared to "
                "scikit-learn."
            )
        shuffled_ids, shuffled_labels, shuffled_features = sk_shuffle(
            examples.ids, examples.labels, examples.features, random_state=rng
        )
        examples = FeatureSet(
            examples.name,
            shuffled_ids,
            labels=shuffled_labels,
            features=shuffled_features,
            vectorizer=examples.vectorizer,
        )

    # run the usual pre-training setup on the (possibly shuffled) data
    self._create_label_dict(examples)
    self._train_setup(examples)

    # build the outer cross-validation iterator
    kfold, cv_groups = setup_cv_fold_iterator(
        cv_folds,
        examples,
        self.model_type._estimator_type,
        stratified=stratified,
        logger=self.logger,
    )

    # Custom CV folds (a dictionary) are re-used for the inner grid search
    # unless the caller explicitly opted out. This only matters for API
    # users; the configparser handles it before this method is called.
    if (
        isinstance(cv_folds, dict)
        and grid_search
        and use_custom_folds_for_grid_search
        and grid_search_folds != cv_folds
    ):
        self.logger.warning("The specified custom folds will be used for the inner grid search.")
        grid_search_folds = cv_folds

    def _fold_subset(indices) -> FeatureSet:
        # carve the examples at the given positions into a new FeatureSet
        return FeatureSet(
            examples.name,
            examples.ids[indices],
            labels=examples.labels[indices],
            features=examples.features[indices],
            vectorizer=examples.vectorizer,
        )

    # per-fold accumulators
    results = []
    grid_search_scores = []
    grid_search_cv_results_dicts = []
    models: Optional[List["Learner"]] = [] if save_cv_models else None
    skll_fold_ids: Optional[FoldMapping] = {} if save_cv_folds else None

    assert examples.labels is not None
    fold_splits = kfold.split(examples.features, examples.labels, cv_groups)
    for fold_num, (train_indices, test_indices) in enumerate(fold_splits):
        assert examples.labels is not None and examples.features is not None

        # train on this fold's training portion
        self._model = None  # prevent feature vectorizer from being reset.
        # there is no need to shuffle again inside a fold beyond what grid
        # search requires, whatever the `shuffle` argument was set to
        fold_grid_score, fold_grid_cv_results = self.train(
            _fold_subset(train_indices),
            grid_search_folds=grid_search_folds,
            grid_search=grid_search,
            grid_objective=grid_objective,
            param_grid=param_grid,
            grid_jobs=grid_jobs,
            shuffle=grid_search,
        )
        grid_search_scores.append(fold_grid_score)
        if models is not None:
            models.append(copy.deepcopy(self))
        grid_search_cv_results_dicts.append(fold_grid_cv_results)

        # evaluate on the held-out portion; every fold after the first
        # appends to the predictions file written by the first one
        fold_result = self.evaluate(
            _fold_subset(test_indices),
            prediction_prefix=prediction_prefix,
            append=fold_num > 0,
            grid_objective=grid_objective,
            output_metrics=output_metrics,
        )
        results.append(fold_result)

        # remember which fold each test example landed in, if asked to
        if skll_fold_ids is not None:
            skll_fold_ids.update(
                (examples.ids[index], str(fold_num)) for index in test_indices
            )

    # hand back everything accumulated over the folds
    return (results, grid_search_scores, grid_search_cv_results_dicts, skll_fold_ids, models)
[docs]
def learning_curve(
    self,
    examples: FeatureSet,
    metric: str,
    cv_folds: Union[int, FoldMapping] = 10,
    train_sizes: LearningCurveSizes = np.linspace(0.1, 1.0, 5),
    override_minimum: bool = False,
) -> Tuple[List[float], List[float], List[float], List[int]]:
    """
    Compute learning curves for this learner on the given examples.

    The curves are obtained by cross-validating on the training examples
    with training sets of increasing size. Adapted from the scikit-learn
    code for learning curve generation
    (cf. ``sklearn.model_selection.learning_curve``).

    Parameters
    ----------
    examples : :class:`skll.data.featureset.FeatureSet`
        The ``FeatureSet`` instance to generate the learning curve on.
    metric : str
        The name of the metric function to use when computing the train
        and test scores for the learning curve.
    cv_folds : Union[int, :class:`skll.types.FoldMapping`], default=10
        The number of folds to use for cross-validation, or a mapping from
        example IDs to folds.
    train_sizes : :class:`skll.types.LearningCurveSizes`, default= :func:`numpy.linspace` with start=0.1, stop=1.0, num=5
        Relative or absolute numbers of training examples used to
        generate the learning curve. Floats are treated as fractions of
        the maximum training set size (which is determined by the
        selected validation method) and must lie within (0, 1]; anything
        else is treated as absolute training set sizes. Note that for
        classification the number of samples usually has to be big
        enough to contain at least one sample from each class.
    override_minimum : bool, default=False
        Learning curves can be unreliable for very small sizes, esp. for
        more than 2 labels. If ``True``, the curve is generated even when
        there are fewer than 500 examples, and a warning is logged. If
        ``False``, an exception is raised instead.

    Returns
    -------
    train_scores : List[float]
        The scores for the training set.
    test_scores : List[float]
        The scores on the test set.
    fit_times : List[float]
        The average times taken to fit each model.
    num_examples : List[int]
        The numbers of training examples used to generate the curve.

    Raises
    ------
    ValueError
        If the number of examples is less than 500.
    """
    # too few examples make the curve unreliable: refuse, unless the
    # caller explicitly asked us to go ahead anyway
    num_examples = len(examples)
    if num_examples < 500:
        if not override_minimum:
            raise ValueError(
                f"Number of training examples provided ({num_examples}) "
                "is less than the minimum needed (500) for the "
                "learning curve to be reliable."
            )
        self.logger.warning(
            "Learning curves can be unreliable for examples fewer than "
            f"500. You provided {num_examples}."
        )

    # probabilistic classifiers do not yield predictions that can be
    # scored directly, so let the user know what will happen instead
    if self.probability:
        self.logger.warning(
            "Since ``probability`` is set, the most likely "
            "class will be computed via an argmax before "
            "computing the curve."
        )

    # the learner gets trained further down, so do the training setup now
    self._create_label_dict(examples)
    self._train_setup(examples)

    # iterator over (train, test) featureset pairs, along with the
    # largest number of training examples any split can provide
    featureset_iter, n_max_training_samples = setup_cv_split_iterator(cv_folds, examples)

    # borrow scikit-learn's private helper to turn the requested sizes
    # into absolute training set sizes
    validation_module = import_module("sklearn.model_selection._validation")
    train_sizes_abs = validation_module._translate_train_sizes(
        train_sizes, n_max_training_samples
    )
    n_unique_ticks = train_sizes_abs.shape[0]

    # cap the parallelism at the core count and at SKLL's own limit
    n_jobs = min(cpu_count(), MAX_CONCURRENT_PROCESSES)

    # one job per (fold, training size) pair; each job trains on a prefix
    # of the fold's training data and scores on its test data
    jobs = (
        joblib.delayed(train_and_score)(self, train_fs[:size], test_fs, metric)
        for train_fs, test_fs in featureset_iter
        for size in train_sizes_abs
    )
    raw_output = joblib.Parallel(n_jobs=n_jobs, pre_dispatch=n_jobs)(jobs)

    # regroup the flat job output as (fold, size, 3 values) and then flip
    # the axes so that the three reported quantities come first
    scores = np.array(raw_output)
    n_cv_folds = scores.shape[0] // n_unique_ticks
    scores = scores.reshape(n_cv_folds, n_unique_ticks, 3).transpose((2, 1, 0))
    train_scores, test_scores, fit_times = scores

    return list(train_scores), list(test_scores), list(fit_times), list(train_sizes_abs)
# Rescaled regressors
#
# Each class below subclasses one of the scikit-learn regressors imported at
# the top of this module and is passed through the ``rescaled`` class
# decorator. The bodies are intentionally empty: whatever differs from the
# parent class comes from the decorator, which is not defined in this part
# of the file.
# NOTE(review): presumably ``rescaled`` makes predictions follow the
# distribution of the training labels -- confirm against its definition.
# The ``# noqa: D101`` markers silence the "missing docstring in public
# class" lint check for these one-line declarations.
@rescaled
class RescaledBayesianRidge(BayesianRidge):  # noqa: D101
    pass
@rescaled
class RescaledAdaBoostRegressor(AdaBoostRegressor):  # noqa: D101
    pass
@rescaled
class RescaledDecisionTreeRegressor(DecisionTreeRegressor):  # noqa: D101
    pass
@rescaled
class RescaledElasticNet(ElasticNet):  # noqa: D101
    pass
@rescaled
class RescaledGradientBoostingRegressor(GradientBoostingRegressor):  # noqa: D101
    pass
@rescaled
class RescaledHuberRegressor(HuberRegressor):  # noqa: D101
    pass
@rescaled
class RescaledKNeighborsRegressor(KNeighborsRegressor):  # noqa: D101
    pass
@rescaled
class RescaledLars(Lars):  # noqa: D101
    pass
@rescaled
class RescaledLasso(Lasso):  # noqa: D101
    pass
@rescaled
class RescaledLinearRegression(LinearRegression):  # noqa: D101
    pass
@rescaled
class RescaledLinearSVR(LinearSVR):  # noqa: D101
    pass
@rescaled
class RescaledMLPRegressor(MLPRegressor):  # noqa: D101
    pass
@rescaled
class RescaledRandomForestRegressor(RandomForestRegressor):  # noqa: D101
    pass
@rescaled
class RescaledRANSACRegressor(RANSACRegressor):  # noqa: D101
    pass
@rescaled
class RescaledRidge(Ridge):  # noqa: D101
    pass
@rescaled
class RescaledSGDRegressor(SGDRegressor):  # noqa: D101
    pass
@rescaled
class RescaledSVR(SVR):  # noqa: D101
    pass
@rescaled
class RescaledTheilSenRegressor(TheilSenRegressor):  # noqa: D101
    pass
```