# Copyright 2014-2016 Thomas Schatz, Mathieu Bernard, Roland Thiolliere
#
# This file is part of h5features.
#
# h5features is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# h5features is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with h5features. If not, see <http://www.gnu.org/licenses/>.
"""Provides Features class to the h5features module."""
import numpy as np
import scipy.sparse as sp
from .entry import Entry
from .entry import nb_per_chunk
def contains_empty(features):
    """Tell whether the features data contain an empty item

    :param features: The features data to check.
    :type features: list of numpy arrays.

    :return: True if the list itself is empty or if at least one
        array has no row (first dimension equal to 0), False
        otherwise.

    """
    # an empty collection is considered as containing an empty item
    if not features:
        return True

    return any(item.shape[0] == 0 for item in features)
def parse_dtype(features, check=True):
    """Return the scalar type of the features

    The returned type is the dtype of the first item in `features`.

    :param features: The features data to parse.
    :param bool check: If True, make sure every item in `features`
        has the same dtype as the first one.

    :raise IOError: if `check` is True and the items have not all
        the same dtype.

    :return: The features scalar type (dtype of the first item).

    """
    reference = features[0].dtype
    if not check:
        return reference

    # collect every dtype first, then compare them to the reference
    found = [item.dtype for item in features]
    for current in found:
        if not current == reference:
            raise IOError('features must be homogeneous')

    return reference
def parse_dim(features, check=True):
    """Return the dimension of the features

    The dimension is the size of the second axis of the first item
    in `features`. Items with a single axis are not supported here:
    reading their second axis raises an IndexError.

    :param features: The features data to parse.
    :param bool check: If True, make sure the dimension is strictly
        positive and shared by every item in `features`.

    :raise IOError: if `check` is True and the dimension is not
        strictly positive, or if the items have not all the same
        dimension.

    :return: The features dimension.

    """
    dim = features[0].shape[1]

    if check:
        if dim <= 0:
            raise IOError('features dimension must be strictly positive')

        # collect every dimension first, then compare them to the first one
        found = [item.shape[1] for item in features]
        if any(current != dim for current in found):
            raise IOError('all files must have the same feature dimension')

    return dim
class Features(Entry):
    """This class manages features in h5features files

    :param data: Features must have time along the lines and
        features along the columns (accommodating row-major storage
        in hdf5 files).
    :type data: list of 2D numpy arrays

    :param bool check: If True, make sure the data is non-empty and
        that all the items share the same dtype and the same
        strictly positive dimension.

    :param bool sparsetodense: If True convert sparse matrices to
        dense when writing. Used for compatibility with 1.0.

    :raise IOError: if features are badly formatted.

    """
    def __init__(self, data, check=True, sparsetodense=False):
        if check and contains_empty(data):
            raise IOError('all features must be non-empty')

        # both parsers raise IOError on inconsistent data when check is True
        dtype = parse_dtype(data, check)
        dim = parse_dim(data, check)

        super(Features, self).__init__('features', data, dim, dtype)

        self.dformat = 'dense'
        self.sparsetodense = sparsetodense

    def __eq__(self, other):
        """Return True if `other` stores the same features than `self`

        Two instances are equal when their format, sparsetodense flag,
        name, dimension, dtype and number of items are equal, and when
        each pair of items has the same shape and the same values.
        Anything lacking one of the compared attributes is not equal.

        """
        if self is other:
            return True

        try:
            # check the little attributes
            if not (self.dformat == other.dformat and
                    self.sparsetodense == other.sparsetodense and
                    self.name == other.name and
                    self.dim == other.dim and
                    self.dtype == other.dtype and
                    len(self.data) == len(other.data)):
                return False

            # check big data
            for mine, theirs in zip(self.data, other.data):
                # Compare the shapes first: the elementwise `==`
                # broadcasts its operands, so items of different but
                # compatible shapes could compare equal, and items of
                # incompatible shapes can make the comparison fail
                # instead of answering False.
                if mine.shape != theirs.shape:
                    return False
                if not (mine == theirs).all():
                    return False
            return True
        except AttributeError:
            return False

    def is_sparse(self):
        """Return True if features are sparse matrices"""
        return self.dformat == 'sparse'

    def is_appendable_to(self, group):
        """Return True if features are appendable to a HDF5 group

        The features are appendable when the group has the same
        format, and when the dataset stored under the features name
        has the same dtype and the same dimension.

        """
        return (group.attrs['format'] == self.dformat and
                group[self.name].dtype == self.dtype and
                # We use a method because dim differs in dense and sparse.
                self._group_dim(group) == self.dim)

    def _group_dim(self, group):
        """Return the dimension of features stored in a HDF5 group

        This is the size of the second axis of the dataset stored
        under the features name, or 1 if that dataset has a single
        axis.

        """
        try:
            return group[self.name].shape[1]
        except IndexError:
            return 1

    def create_dataset(self, group, chunk_size):
        """Initialize the features subgroup

        Store the features format as an attribute of the group,
        delegate the dataset creation to the parent class and compute
        the `nb_per_chunk` attribute.

        """
        group.attrs['format'] = self.dformat
        super(Features, self)._create_dataset(group, chunk_size)

        # TODO attribute declared outside __init__ is not safe. Used
        # because Labels.create_dataset need it.
        self.nb_per_chunk = nb_per_chunk(
            self.dtype.itemsize, self.dim, chunk_size)

    def write_to(self, group, append=False):
        """Write stored features to a given group

        :param group: The HDF5 group to write in. It must already
            contain a dataset stored under the features name.

        :param bool append: If True the features are written after
            the rows already stored in the dataset. If False the
            dataset is resized to the number of frames to write and
            its whole content is overwritten.

        When `sparsetodense` is True the sparse matrices in the
        stored data are replaced by their dense version before
        writing.

        """
        if self.sparsetodense:
            self.data = [x.todense() if sp.issparse(x) else x
                         for x in self.data]

        nframes = sum(d.shape[0] for d in self.data)
        feats = np.concatenate(self.data, axis=0)
        dataset = group[self.name]

        if append:
            # the existing rows are kept, new ones are written after them
            nframes_group = dataset.shape[0]
            dataset.resize(nframes_group + nframes, axis=0)

            # a dataset with a single axis cannot be indexed on two axes
            if self._group_dim(group) == 1:
                dataset[nframes_group:] = feats
            else:
                dataset[nframes_group:, :] = feats
        else:
            dataset.resize(nframes, axis=0)
            dataset[...] = feats
class SparseFeatures(Features):
    """This class is specialized for managing sparse matrices as features

    :param data: The features data, forwarded to the Features
        constructor.

    :param sparsity: A number in [0, 1]. It is used to size the
        `nb_per_chunk` attribute when creating the datasets.

    :param bool check: Forwarded to the Features constructor.

    :raise ValueError: if `sparsity` is not in [0, 1].

    :raise NotImplementedError: always, once the arguments have been
        accepted, because writing sparse features is not implemented.

    """
    def __init__(self, data, sparsity, check=True):
        if sparsity < 0 or sparsity > 1:
            raise ValueError('sparsity must be in [0, 1]')
        self.sparsity = sparsity

        super(SparseFeatures, self).__init__(data, check)

        # Must be set after the parent constructor, which
        # unconditionally sets the format to 'dense'.
        self.dformat = 'sparse'

        raise NotImplementedError(
            'writing sparse features is not implemented')

    def __eq__(self, other):
        """Return True if `other` has the same sparsity and features"""
        try:
            return (self.sparsity == other.sparsity and
                    super(SparseFeatures, self).__eq__(other))
        except AttributeError:
            return False

    def _group_dim(self, group):
        """Return the dimension of features stored in a HDF5 group

        For sparse features the dimension is read from the 'dim'
        attribute of the group.

        """
        return group.attrs['dim']

    def create_dataset(self, group, chunk_size):
        """Initializes sparse specific datasets

        Store the format and the dimension as attributes of the group
        and create three empty, resizable datasets in it:
        'coordinates', the features one and 'frames'.

        """
        group.attrs['format'] = self.dformat
        group.attrs['dim'] = self.dim

        # for storing sparse data we don't use the self.nb_per_chunk,
        # which is only used by the Writer to determine times chunking.
        per_chunk = nb_per_chunk(self.dtype.itemsize, 1, chunk_size)

        # NOTE(review): coordinates are stored as float64 and chunked
        # from the features itemsize -- confirm this is intended.
        group.create_dataset('coordinates', (0, 2), dtype=np.float64,
                             chunks=(per_chunk, 2), maxshape=(None, 2))

        group.create_dataset(self.name, (0,), dtype=self.dtype,
                             chunks=(per_chunk,), maxshape=(None,))

        dtype = np.int64
        chunks = (nb_per_chunk(np.dtype(dtype).itemsize, 1, chunk_size),)
        group.create_dataset('frames', (0,), dtype=dtype,
                             chunks=chunks, maxshape=(None,))

        # Needed by Times.create_dataset
        self.nb_per_chunk = nb_per_chunk(
            self.dtype.itemsize,
            int(round(self.sparsity*self.dim)),
            chunk_size)

    def write_to(self, group, append=False):
        """Not implemented, always raise NotImplementedError"""
        raise NotImplementedError

        # TODO implement this
        # 1- concatenation. put them in right format if they aren't already
        # are_sparse = [x.isspmatrix_coo() for x in features]
        # if not(all(are_sparse)):
        #     for x in features:
        #         if not(x.isspmatrix_coo()):
        #             x = sp.coo_matrix(x)
        # need to get the coo by line ...
        # 2- writing
        # nb, = g['features'].shape
        # g['features'].resize((nb+features.shape[0],))
        # g['features'][nb:] = features
        # g['coordinates'].resize((nb+features.shape[0],2))
        # g['coordinates'][nb:,:] = coordinates
        # nb, = g['frames'].shape
        # g['frames'].resize((nb+frames.shape[0],))
        # g['frames'][nb:] = frames