Source code for h5features.features

# Copyright 2014-2016 Thomas Schatz, Mathieu Bernard, Roland Thiolliere
#
# This file is part of h5features.
#
# h5features is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# h5features is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with h5features.  If not, see <http://www.gnu.org/licenses/>.

"""Provides Features class to the h5features module."""

import numpy as np
import scipy.sparse as sp

from .entry import Entry
from .entry import nb_per_chunk


def contains_empty(features):
    """Check features data are not empty

    :param features: The features data to check.
    :type features: list of numpy arrays.

    :return: True if `features` itself is empty or if one of the
        arrays has no rows (``shape[0] == 0``), False else.

    """
    # an empty container counts as "containing empty" data
    if not features:
        return True
    return any(feature.shape[0] == 0 for feature in features)


def parse_dformat(dformat, check=True):
    """Return `dformat` or raise if it is not 'dense' or 'sparse'

    :param str dformat: The features format to validate.

    :param bool check: If False, `dformat` is returned without any
        validation.

    :raise IOError: if `check` is True and `dformat` is neither
        'dense' nor 'sparse'.

    :return: `dformat`, unchanged.

    """
    if check and dformat not in ('dense', 'sparse'):
        raise IOError(
            "{} is a bad features format, please choose 'dense' or 'sparse'"
            .format(dformat))
    return dformat


def parse_dtype(features, check=True):
    """Return the features scalar type, raise if error

    :param features: The features data, must contain at least one
        array (the dtype of the first one is the reference).
    :type features: list of numpy arrays.

    :param bool check: If True, verify that every array has the same
        dtype as the first one.

    :raise IOError: if `check` is True and all features have not the
        same data type.

    :return: dtype, the features scalar type.

    """
    dtype = features[0].dtype
    if check and not all(x.dtype == dtype for x in features):
        raise IOError('features must be homogeneous')
    return dtype


def parse_dim(features, check=True):
    """Return the features dimension, raise if error

    The dimension is the number of columns (``shape[1]``) of the
    first array in `features`.

    :param features: The features data, must contain at least one
        array.
    :type features: list of 2D numpy arrays.

    :param bool check: If True, verify that the dimension is strictly
        positive and shared by all the arrays.

    :raise IOError: if `check` is True and features have not all the
        same positive dimension.

    :return: dim (int), the features dimension.

    """
    # NOTE: 1D arrays have no ``shape[1]`` so they raise an IndexError
    # here; falling back to ``dim = 1`` in that case was considered
    # but is deliberately left disabled.
    dim = features[0].shape[1]

    if check:
        if not dim > 0:
            raise IOError('features dimension must be strictly positive')
        if any(x.shape[1] != dim for x in features):
            raise IOError('all files must have the same feature dimension')
    return dim


class Features(Entry):
    """This class manages features in h5features files

    :param data: Features must have time along the lines and
        features along the columns (accommodating row-major storage
        in hdf5 files).
    :type data: list of 2D numpy arrays

    :param bool check: If True, validate that `data` is non-empty and
        homogeneous in data type and dimension.

    :param bool sparsetodense: If True convert sparse matrices to
        dense when writing. Used for compatibility with 1.0.

    :raise IOError: if features are badly formatted.

    """
    def __init__(self, data, check=True, sparsetodense=False):
        if check and contains_empty(data):
            raise IOError('all features must be non-empty')

        # both parsers raise IOError on bad data when check is True
        dtype = parse_dtype(data, check)
        dim = parse_dim(data, check)
        super(Features, self).__init__('features', data, dim, dtype)

        self.dformat = 'dense'
        self.sparsetodense = sparsetodense

    def __eq__(self, other):
        """Return True if `other` holds the same attributes and data

        Any object lacking one of the compared attributes is
        considered different.

        """
        if self is other:
            return True
        try:
            ndata = len(self.data)
            # check the little attributes
            if not (self.dformat == other.dformat and
                    self.sparsetodense == other.sparsetodense and
                    self.name == other.name and
                    self.dim == other.dim and
                    self.dtype == other.dtype and
                    ndata == len(other.data)):
                return False
            # check big data
            for mine, theirs in zip(self.data, other.data):
                # Compare shapes first: an elementwise `==` on arrays
                # with mismatched shapes raises with recent numpy
                # instead of evaluating to False.
                if mine.shape != theirs.shape:
                    return False
                if not (mine == theirs).all():
                    return False
            return True
        except AttributeError:
            return False

    def is_sparse(self):
        """Return True if features are sparse matrices"""
        return self.dformat == 'sparse'

    def is_appendable_to(self, group):
        """Return True if features are appendable to a HDF5 group

        The group must have the same format, data type and features
        dimension as this instance.

        """
        return (group.attrs['format'] == self.dformat and
                group[self.name].dtype == self.dtype and
                # We use a method because dim differs in dense and sparse.
                self._group_dim(group) == self.dim)

    def _group_dim(self, group):
        """Return the dimension of features stored in a HDF5 group

        A dataset without a second axis is reported as dimension 1.

        """
        try:
            return group[self.name].shape[1]
        except IndexError:
            return 1

    def create_dataset(self, group, chunk_size):
        """Initialize the features subgroup"""
        group.attrs['format'] = self.dformat
        super(Features, self)._create_dataset(group, chunk_size)

        # TODO attribute declared outside __init__ is not safe. Used
        # because Labels.create_dataset need it.
        self.nb_per_chunk = nb_per_chunk(
            self.dtype.itemsize, self.dim, chunk_size)

    def write_to(self, group, append=False):
        """Write stored features to a given group

        :param group: The HDF5 group to write in, it must already
            contain the features dataset.

        :param bool append: If True the features are added after the
            frames already stored in the dataset, else the dataset is
            resized to the stored features and overwritten.

        """
        if self.sparsetodense:
            self.data = [x.todense() if sp.issparse(x) else x
                         for x in self.data]

        nframes = sum(d.shape[0] for d in self.data)
        dim = self._group_dim(group)
        feats = np.concatenate(self.data, axis=0)

        dataset = group[self.name]
        if append:
            nframes_group = dataset.shape[0]
            dataset.resize(nframes_group + nframes, axis=0)
            if dim == 1:
                dataset[nframes_group:] = feats
            else:
                dataset[nframes_group:, :] = feats
        else:
            dataset.resize(nframes, axis=0)
            dataset[...] = feats


class SparseFeatures(Features):
    """This class is specialized for managing sparse matrices as features

    :param data: The features data, forwarded to :class:`Features`.

    :param float sparsity: Must be in [0, 1]. It scales the features
        dimension when computing the number of items per chunk.

    :param bool check: Forwarded to :class:`Features`.

    :raise ValueError: if `sparsity` is not in [0, 1].

    :raise NotImplementedError: always, once the arguments have been
        validated, because writing sparse features is not implemented.

    """

    def __init__(self, data, sparsity, check=True):
        if sparsity < 0 or sparsity > 1:
            raise ValueError('sparsity must be in [0, 1]')
        self.sparsity = sparsity

        super(SparseFeatures, self).__init__(data, check)

        # Must be assigned after the parent initializer, which sets
        # the format to 'dense'.
        self.dformat = 'sparse'

        raise NotImplementedError(
            'writing sparse features is not implemented')

    def __eq__(self, other):
        """Return True if `other` has same sparsity, attributes and data"""
        try:
            return (self.sparsity == other.sparsity and
                    super(SparseFeatures, self).__eq__(other))
        except AttributeError:
            return False

    def _group_dim(self, group):
        """Return the dimension of features stored in a HDF5 group"""
        # the sparse features dataset is 1D, so the dimension is read
        # from the attribute written by create_dataset
        return group.attrs['dim']

    def create_dataset(self, group, chunk_size):
        """Initializes sparse specific datasets

        Creates the empty and resizable 'coordinates', features and
        'frames' datasets in `group` and stores the format and the
        features dimension as attributes of the group.

        """
        group.attrs['format'] = self.dformat
        group.attrs['dim'] = self.dim

        # for storing sparse data we don't use the self.nb_per_chunk,
        # which is only used by the Writer to determine times chunking.
        per_chunk = nb_per_chunk(self.dtype.itemsize, 1, chunk_size)

        group.create_dataset('coordinates', (0, 2), dtype=np.float64,
                             chunks=(per_chunk, 2), maxshape=(None, 2))

        group.create_dataset(self.name, (0,), dtype=self.dtype,
                             chunks=(per_chunk,), maxshape=(None,))

        dtype = np.int64
        chunks = (nb_per_chunk(np.dtype(dtype).itemsize, 1, chunk_size),)
        group.create_dataset('frames', (0,), dtype=dtype,
                             chunks=chunks, maxshape=(None,))

        # Needed by Times.create_dataset
        # NOTE(review): the rounded value is 0 when sparsity * dim is
        # below 0.5 -- confirm nb_per_chunk accepts a null dimension.
        self.nb_per_chunk = nb_per_chunk(
            self.dtype.itemsize,
            int(round(self.sparsity*self.dim)),
            chunk_size)

    def write_to(self, group, append=False):
        """Not implemented, always raise NotImplementedError"""
        raise NotImplementedError
        # TODO implement this
        # 1- concatenation. put them in right format if they aren't already
        # are_sparse = [x.isspmatrix_coo() for x in features]
        # if not(all(are_sparse)):
        #    for x in features:
        #        if not(x.isspmatrix_coo()):
        #            x = sp.coo_matrix(x)
        # need to get the coo by line ...

        # 2- writing
        # nb, = g['features'].shape
        # g['feature'].resize((nb+features.shape[0],))
        # g['features'][nb:] = features
        # g['coordinates'].resize((nb+features.shape[0],2))
        # g['coordinates'][nb:,:] = coordinates
        # nb, = g['frames'].shape
        # g['frames'].resize((nb+frames.shape[0],))
        # g['frames'][nb:] = frames