Source code for skll.data.featureset

# License: BSD 3 clause
"""
Classes related to storing/merging feature sets.

:author: Dan Blanchard (dblanchard@ets.org)
:author: Nitin Madnani (nmadnani@ets.org)
:author: Jeremy Biggs (jbiggs@ets.org)
:organization: ETS
"""

from copy import deepcopy
from typing import Collection, List, Optional, Tuple, Union

import numpy as np
import scipy.sparse as sp
from pandas import DataFrame
from sklearn.feature_extraction import DictVectorizer, FeatureHasher

from skll.data.dict_vectorizer import DictVectorizer as SkllDictVectorizer
from skll.types import FeatGenerator, FeatureDictList, IdType, LabelType, SparseFeatureMatrix


class FeatureSet(object):
    """
    Encapsulate features, labels, and metadata for a given dataset.

    Parameters
    ----------
    name : str
        The name of this feature set.
    ids : Union[List[str], numpy.ndarray]
        Example IDs for this set.
    labels : Optional[Union[List[str], numpy.ndarray]], default=None
        Labels for this set.
    features : Optional[Union[:class:`skll.types.FeatureDictList`, :class:`numpy.ndarray`]], default=None
        The features for each instance represented as either a
        list of dictionaries or a numpy array (if ``vectorizer`` is
        also specified).
    vectorizer : Optional[Union[:class:`sklearn.feature_extraction.DictVectorizer`, :class:`sklearn.feature_extraction.FeatureHasher`]], default=None
        Vectorizer which will be used to generate the feature matrix.

    Warnings
    --------
    FeatureSets can only be equal if the order of the instances is
    identical because these are stored as lists/arrays. Since
    scikit-learn's ``DictVectorizer`` automatically sorts the
    underlying feature matrix if it is sparse, we do not do any sorting
    before checking for equality. This is not a problem because we
    _always_ use sparse matrices with ``DictVectorizer`` when creating
    FeatureSets.

    Notes
    -----
    If ids, labels, and/or features are not None, the number of rows in
    each array must be equal.
    """

    def __init__(
        self,
        name: str,
        ids: Union[List[str], np.ndarray],
        labels: Optional[Union[List[str], np.ndarray]] = None,
        features: Optional[Union[FeatureDictList, SparseFeatureMatrix]] = None,
        vectorizer: Optional[Union[DictVectorizer, FeatureHasher]] = None,
    ):
        """Initialize a FeatureSet instance."""
        super(FeatureSet, self).__init__()

        # clearly define the attribute types
        self.ids: np.ndarray
        self.labels: Optional[np.ndarray]
        self.features: Optional[SparseFeatureMatrix]
        self.vectorizer: Optional[Union[DictVectorizer, FeatureHasher]]

        self.name = name

        # IDs are always stored internally as a numpy array
        if isinstance(ids, list):
            self.ids = np.array(ids)
        elif isinstance(ids, np.ndarray):
            self.ids = ids
        else:
            raise ValueError("Ids must be a list or numpy array.")

        # labels, if given as a list, are also converted to a numpy array
        if isinstance(labels, list):
            labels = np.array(labels)
        self.labels = labels

        self.vectorizer = vectorizer

        # convert features from list of dictionaries to sparse array, if
        # needed; if no vectorizer was supplied, create a sparse SKLL
        # DictVectorizer on the fly
        if isinstance(features, list):
            if self.vectorizer is None:
                self.vectorizer = SkllDictVectorizer(sparse=True)
            features_array: SparseFeatureMatrix = self.vectorizer.fit_transform(features)
            self.features = features_array
        else:
            self.features = features

        if self.features is not None:
            num_feats = self.features.shape[0]
            num_ids = self.ids.shape[0]
            if num_feats != num_ids:
                raise ValueError(
                    f"Number of IDs ({num_ids}) does not equal "
                    f"number of feature rows ({num_feats})"
                )
            # if no labels were given, create a float array filled with NaN
            # (``fill(None)`` on a float array yields NaN) so that
            # ``has_labels`` can later detect the "no labels" case
            if self.labels is None:
                self.labels = np.empty(num_feats)
                self.labels.fill(None)
            num_labels = self.labels.shape[0]
            if num_feats != num_labels:
                raise ValueError(
                    f"Number of labels ({num_labels}) does not "
                    f"equal number of feature rows ({num_feats})"
                )

    def __contains__(self, value):
        """
        Check if example ID is in the FeatureSet.

        Parameters
        ----------
        value
            The value to check.
        """
        return value in self.ids

    def __eq__(self, other):
        """
        Check whether two featuresets are the same.

        Parameters
        ----------
        other : :class:`skll.data.featureset.FeatureSet`
            The other ``FeatureSet`` to check equivalence with.

        Returns
        -------
        bool
            ``True`` if they are the same, ``False`` otherwise.

        Notes
        -----
        We consider feature values to be equal if any differences are
        in the sixth decimal place or higher.
        """
        return (
            self.ids.shape == other.ids.shape
            and self.labels.shape == other.labels.shape
            and self.features.shape == other.features.shape
            and (self.ids == other.ids).all()
            and (self.labels == other.labels).all()
            # compare the raw CSR data/indices/indptr arrays directly instead
            # of densifying the matrices
            and np.allclose(self.features.data, other.features.data, rtol=1e-6)
            and (self.features.indices == other.features.indices).all()
            and (self.features.indptr == other.features.indptr).all()
            and self.vectorizer == other.vectorizer
        )

    def __iter__(self):
        """Iterate through (ID, label, feature_dict) tuples in feature set."""
        if self.features is not None:
            if not isinstance(self.vectorizer, DictVectorizer):
                raise ValueError(
                    "FeatureSets can only be iterated through if "
                    "they use a DictVectorizer for their feature "
                    "vectorizer."
                )
            for id_, label_, feats in zip(self.ids, self.labels, self.features):
                # reshape to a 2D matrix if we are not using a sparse matrix
                # to store the features
                feats = feats.reshape(1, -1) if not sp.issparse(feats) else feats

                # When calling inverse_transform we have to add [0] to get the
                # results for the current instance because it always returns a
                # 2D array
                yield (id_, label_, self.vectorizer.inverse_transform(feats)[0])
        else:
            return

    def __len__(self) -> int:
        """Return number of rows in the ``FeatureSet`` instance."""
        return self.features.shape[0] if self.features is not None else 0

    def __add__(self, other: "FeatureSet") -> "FeatureSet":
        """
        Combine two feature sets to create a new one.

        The combination is done assuming they both have the same instances
        with the same IDs in the same order.

        Parameters
        ----------
        other : :class:`skll.data.featureset.FeatureSet`
            The other ``FeatureSet`` to add to this one.

        Returns
        -------
        :class:`skll.data.featureset.FeatureSet`
            The combined feature set.

        Raises
        ------
        ValueError
            If IDs are not in the same order in each ``FeatureSet`` instance.
        ValueError
            If either the 'features' or 'vectorizer' attributes are ``None``
            for either of the two ``FeatureSet`` instances.
        ValueError
            If vectorizers are different between the two ``FeatureSet``
            instances.
        ValueError
            If there are duplicate feature names.
        ValueError
            If there are conflicting labels.
        """
        # Check that the sets of IDs are equal
        if set(self.ids) != set(other.ids):
            raise ValueError("IDs are not in the same order in each " "feature set")

        # Compute the relative ordering of IDs for merging the features
        # and labels.
        ids_indices = {id_: index for index, id_ in enumerate(other.ids)}
        relative_order = [ids_indices[self_id] for self_id in self.ids]

        # Initialize the new feature set with a name and the IDs.
        new_set = FeatureSet("+".join(sorted([self.name, other.name])), deepcopy(self.ids))

        # Make sure that features and vectorizer in either feature set are not None
        if (
            self.vectorizer is None
            or other.vectorizer is None
            or self.features is None
            or other.features is None
        ):
            raise ValueError(
                "Cannot combine FeatureSets since either the vectorizer "
                "or the features are not defined."
            )

        # Make sure the two vectorizers are the same type
        if not isinstance(self.vectorizer, type(other.vectorizer)):
            raise ValueError(
                "Cannot combine FeatureSets because they are "
                "not both using the same type of feature "
                "vectorizer (e.g., DictVectorizer, "
                "FeatureHasher)"
            )

        uses_feature_hasher = isinstance(self.vectorizer, FeatureHasher)
        if uses_feature_hasher:
            # hashers are only compatible if they hash into the same space
            if self.vectorizer.n_features != other.vectorizer.n_features:
                raise ValueError(
                    "Cannot combine FeatureSets that use "
                    "FeatureHashers with different values of "
                    "n_features setting."
                )
        else:
            # Check for duplicate feature names.
            if set(self.vectorizer.feature_names_) & set(other.vectorizer.feature_names_):
                raise ValueError(
                    "Cannot combine FeatureSets because they have duplicate feature names."
                )

        # stack the feature matrices side by side, reordering ``other``'s
        # rows so they line up with ``self``'s IDs
        num_feats = self.features.shape[1]
        new_set.features = sp.hstack([self.features, other.features[relative_order]], "csr")
        new_set.vectorizer = deepcopy(self.vectorizer)
        if not uses_feature_hasher:
            # shift the other vectorizer's column indices past ours
            for feat_name, index in other.vectorizer.vocabulary_.items():
                new_set.vectorizer.vocabulary_[feat_name] = index + num_feats
            other_names = other.vectorizer.feature_names_
            new_set.vectorizer.feature_names_.extend(other_names)

        # If either set has labels, check that they don't conflict.
        if self.has_labels:
            # labels should be the same for each FeatureSet, so store once.
            conflicts = not np.all(self.labels == other.labels[relative_order])  # type: ignore
            if other.has_labels and conflicts:
                raise ValueError(
                    "Feature sets have conflicting labels for examples with the same ID."
                )
            new_set.labels = deepcopy(self.labels)
        else:
            labels = other.labels
            if other.has_labels:
                labels = deepcopy(other.labels[relative_order])  # type: ignore
            new_set.labels = labels

        return new_set
[docs] def filter( self, ids: Optional[List[IdType]] = None, labels: Optional[List[LabelType]] = None, features: Optional[List[str]] = None, inverse: bool = False, ) -> None: """ Remove or keep features and/or examples from the given feature set. Filtering is done in-place. Parameters ---------- ids : Optional[List[:class:`skll.types.IdType`]], default=None Examples to keep in the FeatureSet. If ``None``, no ID filtering takes place. labels : Optional[List[:class:`skll.types.LabelType`]], default=None Labels that we want to retain examples for. If ``None``, no label filtering takes place. features : Optional[List[str]], default=None Features to keep in the FeatureSet. To help with filtering string-valued features that were converted to sequences of boolean features when read in, any features in the FeatureSet that contain a ``=`` will be split on the first occurrence and the prefix will be checked to see if it is in ``features``. If ``None``, no feature filtering takes place. Cannot be used if FeatureSet uses a FeatureHasher for vectorization. inverse : bool, default=False Instead of keeping features and/or examples in lists, remove them. Raises ------ ValueError If attempting to use features to filter a ``FeatureSet`` that uses a ``FeatureHasher`` vectorizer. 
""" # Construct mask that indicates which examples to keep mask = np.ones(len(self), dtype=bool) if ids: mask = np.logical_and(mask, np.in1d(self.ids, ids)) if labels and self.labels is not None: mask = np.logical_and(mask, np.in1d(self.labels, labels)) if inverse and (labels is not None or ids is not None): mask = np.logical_not(mask) # Remove examples not in mask self.ids = self.ids[mask] if self.labels is not None: self.labels = self.labels[mask] if self.features is not None: self.features = self.features[mask, :] # Filter features if features and self.features is not None and self.vectorizer is not None: if isinstance(self.vectorizer, FeatureHasher): raise ValueError( "FeatureSets with FeatureHasher vectorizers cannot be filtered by feature." ) columns = np.array( sorted( { feat_num for feat_name, feat_num in self.vectorizer.vocabulary_.items() if (feat_name in features or feat_name.split("=", 1)[0] in features) } ) ) if inverse: all_columns = np.arange(self.features.shape[1]) columns = all_columns[np.logical_not(np.in1d(all_columns, columns))] self.features = self.features[:, columns] self.vectorizer.restrict(columns, indices=True)
[docs] def filtered_iter( self, ids: Optional[List[IdType]] = None, labels: Optional[List[LabelType]] = None, features: Optional[Collection[str]] = None, inverse: bool = False, ) -> FeatGenerator: """ Retain only the specified features and/or examples from the output. Parameters ---------- ids : Optional[List[:class:`skll.types.IdType`]], default=None Examples to keep in the ``FeatureSet``. If ``None``, no ID filtering takes place. labels : Optional[List[:class:`skll.types.LabelType`]], default=None Labels that we want to retain examples for. If ``None``, no label filtering takes place. features : Optional[Collection[str]], default=None Features to keep in the ``FeatureSet``. To help with filtering string-valued features that were converted to sequences of boolean features when read in, any features in the ``FeatureSet`` that contain a `=` will be split on the first occurrence and the prefix will be checked to see if it is in ``features``. If `None`, no feature filtering takes place. Cannot be used if ``FeatureSet`` uses a FeatureHasher for vectorization. inverse : bool, default=False Instead of keeping features and/or examples in lists, remove them. Returns ------- :class:`skll.types.FeatGenerator` A generator that yields 3-tuples containing: - :class:`skll.types.IdType` - The ID of the example. - :class:`skll.types.LabelType` - The label of the example. - :class:`skll.types.FeatureDict` - The feature dictionary, with feature name as the key and example value as the value. Raises ------ ValueError If the vectorizer is not a ``DictVectorizer``. ValueError If any of the "labels", "features", or "vectorizer" attribute is ``None``. """ if self.features is not None and not isinstance(self.vectorizer, DictVectorizer): raise ValueError( "FeatureSets can only be iterated through if they" " use a DictVectorizer for their feature " "vectorizer." 
) if self.labels is None or self.features is None or self.vectorizer is None: raise ValueError("Cannot filter featureset with no labels, features, or vectorizer.") else: for id_, label_, feats in zip(self.ids, self.labels, self.features): # Skip instances with IDs not in filter if ids is not None and (id_ in ids) == inverse: continue # Skip instances with labels not in filter if labels is not None and (label_ in labels) == inverse: continue # reshape to a 2D matrix if we are not using a sparse matrix # to store the features feats = feats.reshape(1, -1) if not sp.issparse(feats) else feats feat_dict = self.vectorizer.inverse_transform(feats)[0] if features is not None: feat_dict = { name: value for name, value in feat_dict.items() if (inverse != (name in features or name.split("=", 1)[0] in features)) } elif not inverse: feat_dict = {} yield id_, label_, feat_dict
def __sub__(self, other: "FeatureSet") -> "FeatureSet": """ Subset ``FeatureSet`` instance by removing all features from ``other`` instance. Parameters ---------- other : :class:`skll.data.featureset.FeatureSet` The other ``FeatureSet`` containing the features that should be removed from this ``FeatureSet``. Returns ------- :class:`skll.data.featureset.FeatureSet` A copy of ``self`` with all features in ``other`` removed. """ new_set = deepcopy(self) if other.vectorizer: new_set.filter(features=other.vectorizer.feature_names_, inverse=True) return new_set @property def has_labels(self): """ Check if ``FeatureSet`` has finite labels. Returns ------- has_labels : bool Whether or not this FeatureSet has any finite labels. """ # make sure that labels is not None or a list of Nones if self.labels is not None and not all(label is None for label in self.labels): # then check that they are not a list of NaNs return not ( np.issubdtype(self.labels.dtype, np.floating) and np.isnan(np.min(self.labels)) ) else: return False def __str__(self): """ Return a string representation of ``FeatureSet``. Returns ------- str: A string representation of ``FeatureSet``. """ return str(self.__dict__) def __repr__(self): """ Return a string representation of ``FeatureSet``. Returns ------- str: A string representation of ``FeatureSet``. """ return repr(self.__dict__) def __getitem__( self, value: Union[int, slice] ) -> Union["FeatureSet", Tuple[IdType, LabelType, FeatureDictList]]: """ Get new feature subset or specific example. Parameters ---------- value: Union[int, slice] The value to use for retrieval. This can either be a slice or an index. Returns ------- Union[:class:`skll.data.featureset.FeatureSet`, Tuple[:class:`skll.types.IdType`, :class:`skll.types.LabelType`, :class:`skll.types.FeatureDictList`]] # noqa: E501 If `value` is a slice, then return a new ``FeatureSet`` instance containing a subset of the data. If it's an index, return the specific example by row number. 
""" # Check if we're slicing if isinstance(value, slice): sliced_ids = self.ids[value] sliced_feats = self.features[value] if self.features is not None else None sliced_labels = self.labels[value] if self.labels is not None else None return FeatureSet( f"{self.name}_{value}", sliced_ids, features=sliced_feats, labels=sliced_labels, vectorizer=self.vectorizer, ) else: label = self.labels[value] if self.labels is not None else "" if self.features is not None and self.vectorizer: submatrix = self.features[value, :] features = self.vectorizer.inverse_transform(submatrix)[0] else: features = [{}] return self.ids[value], label, features
[docs] @staticmethod def split( fs: "FeatureSet", ids_for_split1: List[int], ids_for_split2: Optional[List[int]] = None ) -> Tuple["FeatureSet", "FeatureSet"]: """ Split ``FeatureSet`` into two new ``FeatureSet`` instances. The splitting is done based on the given indices for the two splits. Parameters ---------- fs : skll.data.featureset.FeatureSet The ``FeatureSet`` instance to split. ids_for_split1 : List[int] A list of example indices which will be split out into the first ``FeatureSet`` instance. Note that the FeatureSet instance will respect the order of the specified indices. ids_for_split2 : Optional[List[int]], default=None An optional list of example indices which will be split out into the second ``FeatureSet`` instance. Note that the ``FeatureSet`` instance will respect the order of the specified indices. If this is not specified, then the second ``FeatureSet`` instance will contain the complement of the first set of indices sorted in ascending order. Returns ------- Tuple[:class:`skll.data.featureset.FeatureSet`, :class:`skll.data.featureset.FeatureSet`] A tuple containing the two featureset instances. """ # Note: an alternative way to implement this is to make copies # of the given FeatureSet instance and then use the `filter()` # method but that wastes too much memory since it requires making # two copies of the original FeatureSet which may be huge. With # the current implementation, we are creating new objects but # they should be much smaller than the original FeatureSet. 
ids1 = fs.ids[ids_for_split1] labels1 = fs.labels[ids_for_split1] if fs.labels is not None else None features1 = fs.features[ids_for_split1] if fs.features is not None else None # if ids_for_split2 is not given, it will be the complement of ids_split1 if ids_for_split2 is None: ids_for_split2 = [ind for ind in range(len(fs.ids)) if ind not in ids_for_split1] ids2 = fs.ids[ids_for_split2] labels2 = fs.labels[ids_for_split2] if fs.labels is not None else None features2 = fs.features[ids_for_split2] if fs.features is not None else None fs1 = FeatureSet( f"{fs.name}_1", ids1, labels=labels1, features=features1, vectorizer=fs.vectorizer ) fs2 = FeatureSet( f"{fs.name}_2", ids2, labels=labels2, features=features2, vectorizer=fs.vectorizer ) return fs1, fs2
[docs] @staticmethod def from_data_frame( df: DataFrame, name: str, labels_column: Optional[str] = None, vectorizer: Optional[Union[DictVectorizer, FeatureHasher]] = None, ) -> "FeatureSet": """ Create a ``FeatureSet`` instance from a pandas data frame. Will raise an Exception if pandas is not installed in your environment. The ``ids`` in the ``FeatureSet`` will be the index from the given frame. Parameters ---------- df : pandas.DataFrame The pandas.DataFrame object to use as a ``FeatureSet``. name : str The name of the output ``FeatureSet`` instance. labels_column : Optional[str], default=None The name of the column containing the labels (data to predict). vectorizer : Optional[Union[:class:`sklearn.feature_extraction.DictVectorizer`, :class:`sklearn.feature_extraction.FeatureHasher`]], default=None Vectorizer which will be used to generate the feature matrix. Returns ------- :class:`skll.data.featureset.FeatureSet` A ``FeatureSet`` instance generated from from the given data frame. """ if labels_column: feature_columns = [column for column in df.columns if column != labels_column] labels = df[labels_column].tolist() else: feature_columns = df.columns labels = None features = df[feature_columns].to_dict(orient="records") return FeatureSet( name, ids=df.index.tolist(), labels=labels, features=features, vectorizer=vectorizer )