# License: BSD 3 clause
"""
Handles writing data to various types of data files.
:author: Dan Blanchard (dblanchard@ets.org)
:author: Michael Heilman (mheilman@ets.org)
:author: Nitin Madnani (nmadnani@ets.org)
:author: Jeremy Biggs (jbiggs@ets.org)
:organization: ETS
"""
import json
import logging
import sys
from csv import DictWriter
from decimal import Decimal
from pathlib import Path
from typing import IO, Any, Dict, List, Optional, Set, Tuple, Union
import numpy as np
import pandas as pd
from scipy.sparse import issparse
from sklearn.feature_extraction import DictVectorizer, FeatureHasher
from skll.data import FeatureSet
from skll.types import FeatGenerator, FeatureDict, IdType, LabelType, PathOrStr
class Writer(object):
"""
Write out FeatureSets to files on disk.
This is the base class used to create featureset writers for different
file types.
Parameters
----------
path : :class:`skll.types.PathOrStr`
A path to the feature file we would like to create. The suffix
to this filename must be ``.arff``, ``.csv``, ``.jsonlines``,
``.libsvm``, ``.ndj``, or ``.tsv``. If ``subsets``
is not ``None``, when calling the ``write()`` method, path is
assumed to be a string containing the path to the directory to
write the feature files with an additional file extension
specifying the file type. For example ``/foo/.csv``.
feature_set : :class:`skll.data.featureset.FeatureSet`
The ``FeatureSet`` instance to dump to the file.
quiet : bool, default=True
Do not print "Writing..." status message to stderr.
subsets : Optional[Dict[str, List[str]]], default=None
A mapping from subset names to lists of feature names
that are included in those sets. If given, a feature
file will be written for every subset (with the name
containing the subset name as suffix to ``path``).
Note that since string-valued features are automatically
converted into boolean features with names of the form
``FEATURE_NAME=STRING_VALUE``, when doing the
filtering, the portion before the ``=`` is all that's
used for matching. Therefore, you do not need to
enumerate all of these boolean feature names in your
mapping.
logger : Optional[logging.Logger], default=None
A logger instance to use to log messages instead of creating
a new one by default.
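Examples
--------
A minimal sketch of writing one file per feature subset; the
``FeatureSet`` below is illustrative and the output directory
(``/tmp/splits`` here) is assumed to exist:
>>> from skll.data import FeatureSet
>>> fs = FeatureSet("example",
...                 ids=["ex1", "ex2"],
...                 labels=["a", "b"],
...                 features=[{"f1": 1.0, "f2": 2.0},
...                           {"f1": 3.0, "f2": 4.0}])
>>> writer = Writer.for_path("/tmp/splits/.csv", fs,
...                          subsets={"first": ["f1"], "second": ["f2"]})
>>> writer.write()  # writes /tmp/splits/first.csv and /tmp/splits/second.csv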
"""
def __init__(
self,
path: PathOrStr,
feature_set: FeatureSet,
quiet: bool = True,
subsets: Optional[Dict[str, List[str]]] = None,
logger: Optional[logging.Logger] = None,
):
"""Initialize base Writer class."""
super(Writer, self).__init__()
self.quiet = quiet
self.path = Path(path)
self.feat_set = feature_set
self.subsets = subsets
self.logger = logger if logger else logging.getLogger(__name__)
# Get prefix & extension for checking file types & writing subset files;
# since we also need to handle paths like "foo/.csv" for subset-writing
# we have to do a bit of introspection before figuring out the various
# parts of the path
parent, stem, suffix = self.path.parent, self.path.stem, self.path.suffix
if stem.startswith(".") and not suffix:
self.root = parent
self.ext = stem.lower()
else:
self.root = parent / stem
self.ext = suffix.lower()
self._progress_msg = ""
self._use_pandas = False
@classmethod
def for_path(cls, path: PathOrStr, feature_set: FeatureSet, **kwargs) -> "Writer":
"""
Retrieve object of ``Writer`` sub-class appropriate for given path.
Parameters
----------
path : :class:`skll.types.PathOrStr`
A path to the feature file we would like to create. The
suffix to this filename must be ``.arff``, ``.csv``,
``.jsonlines``, ``.libsvm``, ``.ndj``, or
``.tsv``. If ``subsets`` is not ``None``, when calling the
``write()`` method, path is assumed to be a string
containing the path to the directory to write the feature
files with an additional file extension specifying the
file type. For example ``/foo/.csv``.
feature_set : :class:`skll.data.featureset.FeatureSet`
The ``FeatureSet`` instance to dump to the output file.
kwargs : Optional[Dict[str, Any]]
The keyword arguments for ``for_path`` are the same as
those accepted by the initializer of the desired ``Writer`` subclass.
Returns
-------
writer : :class:`skll.data.Writer`
New instance of the Writer sub-class that is
appropriate for the given path.
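Examples
--------
A sketch of the extension-based dispatch; the paths are illustrative
and no files are created until ``write()`` is called:
>>> from skll.data import FeatureSet
>>> fs = FeatureSet("example", ids=["ex1"], labels=["a"],
...                 features=[{"f1": 1.0}])
>>> type(Writer.for_path("/tmp/example.ndj", fs)).__name__
'NDJWriter'
>>> type(Writer.for_path("/tmp/example.tsv", fs)).__name__
'TSVWriter'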
"""
# Get lowercase extension for file extension checking
# NOTE: the reason we are doing this complicated gymnastics
# instead of just using `path.suffix` is because sometimes
# `path` may look like `foo/.jsonlines` when we are writing
# subsets so we need to handle that edge case
path = Path(path)
stem, suffix = path.stem, path.suffix
if stem.startswith(".") and not suffix:
ext = stem.lower()
else:
ext = suffix.lower()
return EXT_TO_WRITER[ext](path, feature_set, **kwargs)
def write(self) -> None:
"""Write out this Writer's ``FeatureSet`` to a file in its format."""
if isinstance(self.feat_set.vectorizer, FeatureHasher):
raise ValueError(
"Writer cannot write sets that use a " "FeatureHasher for vectorization."
)
# Write one feature file if we weren't given a dict of subsets
if self.subsets is None:
self._write_subset(self.path)
# Otherwise write one feature file per subset
else:
for subset_name, filter_features in self.subsets.items():
self.logger.debug(f"Subset ({subset_name}) features: " f"{filter_features}")
sub_path = self.root / f"{subset_name}{self.ext}"
self._write_subset(sub_path, set(filter_features))
def _write_subset(
self, sub_path: PathOrStr, filter_features: Optional[Set[str]] = None
) -> None:
"""
Write out given ``FeatureSet`` instance to a file in this class's format.
Parameters
----------
sub_path : :class:`skll.types.PathOrStr`
The path to the file we want to create for this subset
of our data.
filter_features : Optional[Set[str]], default=None
Set of features to include in current feature file.
"""
self.logger.debug(f"sub_path: {sub_path}")
self.logger.debug(f"feature_set: {self.feat_set.name}")
self.logger.debug(f"filter_features: {filter_features}")
if not self.quiet:
self._progress_msg = f"Writing {sub_path}..."
print(self._progress_msg, end="\r", file=sys.stderr)
sys.stderr.flush()
if not self._use_pandas:
# Apply filtering
filtered_set: Union[FeatGenerator, FeatureSet] = (
self.feat_set.filtered_iter(features=filter_features)
if filter_features is not None
else self.feat_set
)
# Open file for writing and write each line
with open(sub_path, "w", encoding="utf-8") as output_file:
# Write out the header if this format requires it
self._write_header(filtered_set, output_file, filter_features)
# Write individual lines
for ex_num, (id_, label_, feat_dict) in enumerate(filtered_set):
self._write_line(id_, label_, feat_dict, output_file)
if not self.quiet and ex_num % 100 == 0:
print(f"{self._progress_msg}{ex_num:>15}", end="\r", file=sys.stderr)
sys.stderr.flush()
else:
self._write_data(self.feat_set, sub_path, filter_features)
if not self.quiet:
print(f"{self._progress_msg}{'done':<15}", file=sys.stderr)
sys.stderr.flush()
def _write_header(self, feature_set, output_file, filter_features):
"""
Write header to file.
Called before lines are written to file, so that headers can be written
for files that need them.
Parameters
----------
feature_set : Ignored
Not used.
output_file : Ignored
Not used.
filter_features : Ignored
Not used.
"""
pass
def _write_line(self, id_, label_, feat_dict, output_file):
"""
Write the current line in the file in this Writer's format.
Parameters
----------
id_ : Ignored
Not used.
label_ : Ignored
Not used.
feat_dict : Ignored
Not used.
output_file : Ignored
Not used.
Raises
------
NotImplementedError
"""
raise NotImplementedError
def _write_data(self, feature_set, output_file, filter_features):
"""
Write full data set in Writer's format using `pandas`, rather than row-by-row.
Parameters
----------
feature_set : Ignored
Not used.
output_file : Ignored
Not used.
filter_features : Ignored
Not used.
Raises
------
NotImplementedError
"""
raise NotImplementedError
def _get_column_names_and_indexes(
self, feature_set: FeatureSet, filter_features: Optional[Set[str]] = None
) -> Tuple[List[str], List[int]]:
"""
Get names of columns and associated indices for (possibly filtered) features.
Parameters
----------
feature_set : :class:`skll.data.featureset.FeatureSet`
The ``FeatureSet`` instance being written to a file.
filter_features : Optional[Set[str]], default=None
If only writing a subset of the features in the
FeatureSet to ``output_file``, these are the
features to include in this file.
Returns
-------
column_names : List[str]
A list of the (possibly filtered) column names.
column_indexes : List[int]
A list of the (possibly filtered) column indexes.
"""
# if we're not doing filtering,
# then just take all the feature names
self.logger.debug(feature_set)
if isinstance(feature_set.vectorizer, DictVectorizer):
if filter_features is None:
filter_features = feature_set.vectorizer.feature_names_
# create a list of tuples with (column names, column indexes)
# so that we can correctly extract the appropriate columns
columns = sorted(
[
(col_name, col_idx)
for col_name, col_idx in feature_set.vectorizer.vocabulary_.items()
if (col_name in filter_features or col_name.split("=", 1)[0] in filter_features)
],
key=lambda x: x[1],
)
# then, split the names and indexes into separate lists
column_names, column_indexes = zip(*columns)
return list(column_names), list(column_indexes)
else:
return [], []
class CSVWriter(Writer):
"""
Writer for writing out ``FeatureSet`` instances as CSV files.
Parameters
----------
path : :class:`skll.types.PathOrStr`
A path to the feature file we would like to create.
If ``subsets`` is not ``None``, this is assumed to be a string
containing the path to the directory to write the feature
files with an additional file extension specifying the file
type. For example ``/foo/.csv``.
feature_set : :class:`skll.data.featureset.FeatureSet`
The ``FeatureSet`` instance to dump to the output file.
quiet : bool, default=True
Do not print "Writing..." status message to stderr.
subsets : Optional[Dict[str, List[str]]], default=None
A mapping from subset names to lists of feature names
that are included in those sets. If given, a feature
file will be written for every subset (with the name
containing the subset name as suffix to ``path``).
Note that since string-valued features are automatically
converted into boolean features with names of the form
``FEATURE_NAME=STRING_VALUE``, when doing the
filtering, the portion before the ``=`` is all that's
used for matching. Therefore, you do not need to
enumerate all of these boolean feature names in your
mapping.
logger : Optional[logging.Logger], default=None
A logger instance to use to log messages instead of creating
a new one by default.
label_col : str, default="y"
The column name containing the label.
id_col : str, default="id"
The column name containing the ID.
pandas_kwargs : Optional[Dict[str, Any]], default=None
Arguments that will be passed directly to the `pandas` I/O writer (i.e., ``to_csv``).
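Examples
--------
A minimal sketch with custom ID and label column names; the path is
illustrative and any extra keyword arguments are forwarded to
``DataFrame.to_csv``:
>>> from skll.data import FeatureSet
>>> fs = FeatureSet("example", ids=["ex1", "ex2"], labels=[1, 2],
...                 features=[{"f1": 1.0}, {"f1": 3.0}])
>>> writer = CSVWriter("/tmp/example.csv", fs,
...                    label_col="label", id_col="instance_id")
>>> writer.write()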
"""
def __init__(
self,
path: PathOrStr,
feature_set: FeatureSet,
quiet: bool = True,
subsets: Optional[Dict[str, List[str]]] = None,
logger: Optional[logging.Logger] = None,
label_col: str = "y",
id_col: str = "id",
pandas_kwargs: Optional[Dict[str, Any]] = None,
):
"""Initialize the CSVWriter class."""
self.label_col = label_col
self.id_col = id_col
self._pandas_kwargs = {} if pandas_kwargs is None else pandas_kwargs
self._sep = self._pandas_kwargs.pop("sep", ",")
self._index = self._pandas_kwargs.pop("index", False)
super(CSVWriter, self).__init__(
path, feature_set, quiet=quiet, subsets=subsets, logger=logger
)
self._use_pandas = True
def _build_dataframe_with_features(
self, feature_set: FeatureSet, filter_features: Optional[Set[str]] = None
) -> pd.DataFrame:
"""
Create and filter data frame from features in given feature set.
Parameters
----------
feature_set : :class:`skll.data.featureset.FeatureSet`
The ``FeatureSet`` instance being written to a file.
filter_features : Optional[Set[str]], default=None
If only writing a subset of the features in the
FeatureSet to ``output_file``, these are the
features to include in this file.
Returns
-------
df_features : pandas.DataFrame
The data frame constructed from the feature set. The frame may be
empty if there are no features in the feature set.
Raises
------
ValueError
If ID column is already used as feature.
If label column is already used as feature.
"""
# if there is no filtering, then just keep all the names
(column_names, column_idxs) = self._get_column_names_and_indexes(
feature_set, filter_features
)
# create the data frame from the feature set;
# then, select only the columns that we want,
# and give the columns their correct names
if feature_set.features is not None:
if issparse(feature_set.features):
df_features = pd.DataFrame(feature_set.features.toarray())
else:
df_features = pd.DataFrame(feature_set.features)
df_features = df_features.iloc[:, column_idxs].copy()
df_features.columns = column_names
return df_features
else:
return pd.DataFrame()
def _build_dataframe(
self,
feature_set: FeatureSet,
filter_features: Optional[Set[str]] = None,
df_features: Optional[pd.DataFrame] = None,
) -> pd.DataFrame:
"""
Create and filter data frame with features in given feature set.
Add the IDs and labels, if applicable. If the data frame
with features already exists, pass `df_features`. Then the IDs and labels will
simply be added to the existing data frame containing the features.
Parameters
----------
feature_set : :class:`skll.data.featureset.FeatureSet`
The ``FeatureSet`` instance being written to a file.
filter_features : Optional[Set[str]], default=None
If only writing a subset of the features in the
FeatureSet to ``output_file``, these are the
features to include in this file.
df_features : Optional[pandas.DataFrame], default=None
If the data frame with features already exists,
then we use it and add IDs and labels; otherwise,
the feature data frame will be created from the feature set.
Returns
-------
df_features : pandas.DataFrame
The data frame constructed from the feature set.
Raises
------
ValueError
If ID column is already used as feature.
If label column is already used as feature.
"""
# create the data frame with just the features
# from the feature set, at this point
if df_features is None:
df_features = self._build_dataframe_with_features(feature_set, filter_features)
# if the id column is already in the data frame,
# then raise an error; otherwise, just add the ids
if self.id_col in df_features:
raise ValueError(f"ID column name {self.id_col} already used as feature name.")
df_features[self.id_col] = feature_set.ids
# if the labels should exist but the column is already
# in the data frame, then raise an error; otherwise, just add the labels
if feature_set.has_labels:
if self.label_col in df_features:
raise ValueError(
f'Class column name "{self.label_col}" ' "already used as feature name."
)
df_features[self.label_col] = feature_set.labels
return df_features
def _write_data(
self,
feature_set: FeatureSet,
output_file: PathOrStr,
filter_features: Optional[Set[str]] = None,
) -> None:
"""
Write the data in CSV format.
Parameters
----------
feature_set : :class:`skll.data.featureset.FeatureSet`
The ``FeatureSet`` instance being written to a file.
output_file : :class:`skll.types.PathOrStr`
The path of the file being written to.
filter_features : Optional[Set[str]], default=None
If only writing a subset of the features in the
FeatureSet to ``output_file``, these are the
features to include in this file.
"""
df = self._build_dataframe(feature_set, filter_features=filter_features)
df.to_csv(output_file, sep=self._sep, index=self._index, **self._pandas_kwargs)
class TSVWriter(CSVWriter):
"""
Writer for writing out FeatureSets as TSV files.
Parameters
----------
path : :class:`skll.types.PathOrStr`
A path to the feature file we would like to create.
If ``subsets`` is not ``None``, this is assumed to be a string
containing the path to the directory to write the feature
files with an additional file extension specifying the file
type. For example ``/foo/.tsv``.
feature_set : :class:`skll.data.featureset.FeatureSet`
The ``FeatureSet`` instance to dump to the output file.
quiet : bool, default=True
Do not print "Writing..." status message to stderr.
subsets : Optional[Dict[str, List[str]]], default=None
A mapping from subset names to lists of feature names
that are included in those sets. If given, a feature
file will be written for every subset (with the name
containing the subset name as suffix to ``path``).
Note that since string-valued features are automatically
converted into boolean features with names of the form
``FEATURE_NAME=STRING_VALUE``, when doing the
filtering, the portion before the ``=`` is all that's
used for matching. Therefore, you do not need to
enumerate all of these boolean feature names in your
mapping.
logger : Optional[logging.Logger], default=None
A logger instance to use to log messages instead of creating
a new one by default.
label_col: str, default="y"
The column name containing the label.
id_col: str, default="id"
The column name containing the ID.
pandas_kwargs : Optional[Dict[str, Any]], default=None
Arguments that will be passed directly to the `pandas` I/O writer (i.e., ``to_csv``).
"""
def __init__(
self,
path: PathOrStr,
feature_set: FeatureSet,
quiet: bool = True,
subsets: Optional[Dict[str, List[str]]] = None,
logger: Optional[logging.Logger] = None,
label_col: str = "y",
id_col: str = "id",
pandas_kwargs: Optional[Dict[str, Any]] = None,
):
"""Initialize the TSVWriter class."""
super(TSVWriter, self).__init__(
path,
feature_set,
quiet=quiet,
subsets=subsets,
logger=logger,
label_col=label_col,
id_col=id_col,
pandas_kwargs=pandas_kwargs,
)
self._sep = "\t"
class ARFFWriter(Writer):
"""
Writer for writing out FeatureSets as ARFF files.
Parameters
----------
path : :class:`skll.types.PathOrStr`
A path to the feature file we would like to create.
If ``subsets`` is not ``None``, this is assumed to be a string
containing the path to the directory to write the feature
files with an additional file extension specifying the file
type. For example ``/foo/.arff``.
feature_set : :class:`skll.data.featureset.FeatureSet`
The ``FeatureSet`` instance to dump to the output file.
quiet : bool, default=True
Do not print "Writing..." status message to stderr.
subsets : Optional[Dict[str, List[str]]], default=None
A mapping from subset names to lists of feature names
that are included in those sets. If given, a feature
file will be written for every subset (with the name
containing the subset name as suffix to ``path``).
Note that since string-valued features are automatically
converted into boolean features with names of the form
``FEATURE_NAME=STRING_VALUE``, when doing the
filtering, the portion before the ``=`` is all that's
used for matching. Therefore, you do not need to
enumerate all of these boolean feature names in your
mapping.
logger : Optional[logging.Logger], default=None
A logger instance to use to log messages instead of creating
a new one by default.
relation : str, default='skll_relation'
The name of the relation in the ARFF file.
regression : bool, default=False
Is this an ARFF file to be used for regression?
dialect : str, default="excel-tab"
The CSV dialect to use when writing the file.
label_col : str, default="y"
The column name containing the label.
id_col : str, default="id"
The column name containing the ID.
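Examples
--------
A minimal sketch for a classification ARFF file; the path is
illustrative and this assumes the package registers its custom
``arff`` CSV dialect at import time:
>>> from skll.data import FeatureSet
>>> fs = FeatureSet("example", ids=["ex1", "ex2"], labels=["a", "b"],
...                 features=[{"f1": 1.0}, {"f1": 2.0}])
>>> ARFFWriter("/tmp/example.arff", fs, relation="my_relation").write()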
"""
def __init__(
self,
path: PathOrStr,
feature_set: FeatureSet,
quiet: bool = True,
subsets: Optional[Dict[str, List[str]]] = None,
logger: Optional[logging.Logger] = None,
relation="skll_relation",
regression=False,
dialect="excel-tab",
label_col="y",
id_col="id",
):
"""Initialize the ARFFWRiter class."""
self.relation = relation
self.regression = regression
self.dialect = dialect
self.label_col = label_col
self.id_col = id_col
super(ARFFWriter, self).__init__(
path, feature_set, quiet=quiet, subsets=subsets, logger=logger
)
self._dict_writer: Optional[DictWriter[str]] = None
def _write_header(
self, feature_set: FeatureSet, output_file: IO[str], filter_features: Set[str]
) -> None:
"""
Write headers to ARFF file.
Called before lines are written to file, so that headers can be written
for files that need them.
Parameters
----------
feature_set : Ignored
Not used.
output_file : IO[str]
The file being written to.
filter_features : Set[str]
If only writing a subset of the features in the
FeatureSet to ``output_file``, these are the
features to include in this file.
"""
fieldnames, _ = self._get_column_names_and_indexes(self.feat_set, filter_features)
fieldnames.append(self.id_col)
# Add relation to header
print(f"@relation '{self.relation}'\n", file=output_file)
# Loop through fields writing the header info for the ARFF file
for field in fieldnames:
field = field.replace("\\", "\\\\").replace("'", "\\'")
print(f"@attribute '{field}' numeric", file=output_file)
# Print class label header if necessary
if self.regression:
print(f"@attribute {self.label_col} numeric", file=output_file)
else:
if self.feat_set.has_labels:
sorted_features = sorted(set(self.feat_set.labels)) # type: ignore
labels_str = ",".join([str(feat) for feat in sorted_features])
labels_str = "{" + labels_str + "}"
print(f"@attribute {self.label_col} {labels_str}", file=output_file)
if self.label_col:
fieldnames.append(self.label_col)
# Create CSV writer to handle missing values for lines in data section
# and to ignore the instance values for non-numeric attributes
self._dict_writer = DictWriter(
output_file, fieldnames, restval=0, extrasaction="ignore", dialect="arff"
)
# Finish header and start data section
print("\n@data", file=output_file)
def _write_line(
self, id_: IdType, label_: LabelType, feat_dict: FeatureDict, output_file: IO[str]
) -> None:
"""
Write the current line in the file in this Writer's format.
Parameters
----------
id_ : :class:`skll.types.IdType`
The ID for the current instance.
label_ : :class:`skll.types.LabelType`
The label for the current instance.
feat_dict : :class:`skll.types.FeatureDict`
The feature dictionary for the current instance.
output_file : Ignored.
Not used.
Raises
------
ValueError
If the class column name is already used as a feature name.
ValueError
If the ID column name is already used as a feature name.
"""
# Add class column to feat_dict (unless this is unlabeled data)
if self.label_col not in feat_dict:
if self.feat_set.has_labels:
feat_dict[self.label_col] = label_
else:
raise ValueError(
f'Class column name "{self.label_col}" already ' "used as feature name."
)
# Add id column to feat_dict if id is provided
if self.id_col not in feat_dict:
feat_dict[self.id_col] = id_
else:
raise ValueError(f'ID column name "{self.id_col}" already used as feature name.')
# Write out line
if self._dict_writer:
self._dict_writer.writerow(feat_dict)
class NDJWriter(Writer):
"""
Writer for writing out FeatureSets as .jsonlines/.ndj files.
Parameters
----------
path : :class:`skll.types.PathOrStr`
A path to the feature file we would like to create.
If ``subsets`` is not ``None``, this is assumed to be a string
containing the path to the directory to write the feature
files with an additional file extension specifying the file
type. For example ``/foo/.ndj``.
feature_set : :class:`skll.data.featureset.FeatureSet`
The ``FeatureSet`` instance to dump to the output file.
quiet : bool, default=True
Do not print "Writing..." status message to stderr.
subsets : Optional[Dict[str, List[str]]], default=None
A mapping from subset names to lists of feature names
that are included in those sets. If given, a feature
file will be written for every subset (with the name
containing the subset name as suffix to ``path``).
Note that since string-valued features are automatically
converted into boolean features with names of the form
``FEATURE_NAME=STRING_VALUE``, when doing the
filtering, the portion before the ``=`` is all that's
used for matching. Therefore, you do not need to
enumerate all of these boolean feature names in your
mapping.
logger : Optional[logging.Logger], default=None
A logger instance to use to log messages instead of creating
a new one by default.
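Examples
--------
A minimal sketch; each instance is written as one JSON object per
line with ``id``, ``x``, and ``y`` keys (the path is illustrative):
>>> from skll.data import FeatureSet
>>> fs = FeatureSet("example", ids=["ex1"], labels=["a"],
...                 features=[{"f1": 1.0}])
>>> NDJWriter("/tmp/example.jsonlines", fs).write()

The resulting line would look like
``{"id": "ex1", "x": {"f1": 1.0}, "y": "a"}``.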
"""
def __init__(
self,
path: PathOrStr,
feature_set: FeatureSet,
quiet: bool = True,
subsets: Optional[Dict[str, List[str]]] = None,
logger: Optional[logging.Logger] = None,
):
"""Initialize the NDJWriter class."""
super(NDJWriter, self).__init__(
path, feature_set, quiet=quiet, subsets=subsets, logger=logger
)
def _write_line(
self,
id_: IdType,
label_: Union[LabelType, np.int64, np.float64],
feat_dict: FeatureDict,
output_file: IO[str],
) -> None:
"""
Write the current line in the file in NDJ format.
Parameters
----------
id_ : :class:`skll.types.IdType`
The ID for the current instance.
label_ : :class:`skll.types.LabelType`
The label for the current instance.
feat_dict : :class:`skll.types.FeatureDict`
The feature dictionary for the current instance.
output_file : IO[str]
The file being written to.
"""
example_dict: FeatureDict = {}
# Don't try to add class column if this is label-less data
# Try to convert the label to a scalar assuming it's a numpy
# non-scalar type (e.g., int64) but if that doesn't work
# then use it as is
if self.feat_set.has_labels:
if hasattr(label_, "item"):
example_dict["y"] = label_.item()
else:
example_dict["y"] = label_
# Try to convert the ID to a scalar assuming it's a numpy
# non-scalar type (e.g., int64) but if that doesn't work
# then use it as is
if hasattr(id_, "item"):
example_dict["id"] = id_.item()
else:
example_dict["id"] = id_
example_dict["x"] = feat_dict
print(json.dumps(example_dict, sort_keys=True), file=output_file)
class LibSVMWriter(Writer):
"""
Writer for writing out FeatureSets as LibSVM/SVMLight files.
Parameters
----------
path : :class:`skll.types.PathOrStr`
A path to the feature file we would like to create.
If ``subsets`` is not ``None``, this is assumed to be a string
containing the path to the directory to write the feature
files with an additional file extension specifying the file
type. For example ``/foo/.libsvm``.
feature_set : :class:`skll.data.featureset.FeatureSet`
The ``FeatureSet`` instance to dump to the output file.
quiet : bool, default=True
Do not print "Writing..." status message to stderr.
subsets : Optional[Dict[str, List[str]]], default=None
A mapping from subset names to lists of feature names
that are included in those sets. If given, a feature
file will be written for every subset (with the name
containing the subset name as suffix to ``path``).
Note that since string-valued features are automatically
converted into boolean features with names of the form
``FEATURE_NAME=STRING_VALUE``, when doing the
filtering, the portion before the ``=`` is all that's
used for matching. Therefore, you do not need to
enumerate all of these boolean feature names in your
mapping.
logger : Optional[logging.Logger], default=None
A logger instance to use to log messages instead of creating
a new one by default.
label_map : Optional[Dict[str, int]], default=None
A mapping from label strings to integers.
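Examples
--------
A minimal sketch with an explicit label-to-integer mapping (the path
and the mapping are illustrative):
>>> from skll.data import FeatureSet
>>> fs = FeatureSet("example", ids=["ex1", "ex2"],
...                 labels=["pos", "neg"],
...                 features=[{"f1": 1.0}, {"f1": 2.0}])
>>> LibSVMWriter("/tmp/example.libsvm", fs,
...              label_map={"pos": 1, "neg": -1}).write()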
"""
LIBSVM_REPLACE_DICT = {
":": "\u2236",
"#": "\uFF03",
" ": "\u2002",
"=": "\ua78a",
"|": "\u2223",
}
def __init__(
self,
path: PathOrStr,
feature_set: FeatureSet,
quiet: bool = True,
subsets: Optional[Dict[str, List[str]]] = None,
logger: Optional[logging.Logger] = None,
label_map: Optional[Dict[Any, Any]] = None,
):
"""Initialize the LibSVMWriter class."""
self.label_map = label_map
super(LibSVMWriter, self).__init__(
path, feature_set, quiet=quiet, subsets=subsets, logger=logger
)
if self.label_map is None:
fs_labels = feature_set.labels if feature_set.has_labels else np.array([])
self.label_map = {
label: num
for num, label in enumerate(
sorted(
{
label
for label in fs_labels # type: ignore
if not isinstance(label, (int, float))
}
)
)
}
# Add fake entry to the label map for None
self.label_map[None] = "00000"
@staticmethod
def _sanitize(name: Union[IdType, LabelType]) -> Union[IdType, LabelType]:
"""
Sanitize feature names for older feature formats.
Replace special characters in names with close unicode
equivalents to make things loadable by LibSVM, LibLinear, or
SVMLight.
Parameters
----------
name : Union[:class:`skll.types.IdType`, :class:`skll.types.LabelType`]
Input name in which special characters are replaced with unicode
equivalents.
Returns
-------
Union[:class:`skll.types.IdType`, :class:`skll.types.LabelType`]
The sanitized name with special characters replaced.
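Examples
--------
A doctest-style sketch of the replacement behavior; non-string
names pass through unchanged:
>>> LibSVMWriter._sanitize("f1=low") == "f1\ua78alow"
True
>>> LibSVMWriter._sanitize(42)
42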
"""
sanitized_name = name
if isinstance(sanitized_name, str):
for orig, replacement in LibSVMWriter.LIBSVM_REPLACE_DICT.items():
sanitized_name = sanitized_name.replace(orig, replacement)
return sanitized_name
def _write_line(
self, id_: IdType, label_: LabelType, feat_dict: FeatureDict, output_file: IO[str]
) -> None:
"""
Write the current line in the file in this Writer's format.
Parameters
----------
id_ : :class:`skll.types.IdType`
The ID for the current instance.
label_ : :class:`skll.types.LabelType`
The label for the current instance.
feat_dict : :class:`skll.types.FeatureDict`
The feature dictionary for the current instance.
output_file : IO[str]
The file being written to.
"""
field_values = (
sorted(
[
(self.feat_set.vectorizer.vocabulary_[field] + 1, value)
for field, value in feat_dict.items()
if Decimal(value) != 0
]
)
if self.feat_set.vectorizer
else []
)
# Print label
if self.label_map:
if label_ in self.label_map:
print(self.label_map[label_], end=" ", file=output_file)
else:
print(label_, end=" ", file=output_file)
# Print features
print(
" ".join((f"{field}:{value}" for field, value in field_values)),
end=" ",
file=output_file,
)
# Print comment with id and mappings
print("#", end=" ", file=output_file)
print(self._sanitize(id_), end="", file=output_file)
print(" |", end=" ", file=output_file)
if self.label_map:
if label_ in self.label_map:
print(
f"{self._sanitize(self.label_map[label_])}={self._sanitize(label_)}",
end=" | ",
file=output_file,
)
else:
print(" |", end=" ", file=output_file)
line = (
" ".join(
f"{self.feat_set.vectorizer.vocabulary_[field] + 1}={self._sanitize(field)}"
for field, value in feat_dict.items()
if Decimal(value) != 0
)
if self.feat_set.vectorizer
else ""
)
print(line, file=output_file)
# Constants
EXT_TO_WRITER = {
".arff": ARFFWriter,
".csv": CSVWriter,
".jsonlines": NDJWriter,
".libsvm": LibSVMWriter,
".ndj": NDJWriter,
".tsv": TSVWriter,
}