# Source code for h5features.h5features

# Copyright 2014-2016 Thomas Schatz, Mathieu Bernard, Roland Thiolliere
#
# This file is part of h5features.
#
# h5features is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# h5features is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with h5features.  If not, see <http://www.gnu.org/licenses/>.

"""Provides the read() and write() wrapper functions.

.. note::

   For compatibility with h5features 1.0, this legacy top-level API
   has been conserved in this module. Except for use in legacy code,
   it is **better not to use it**. Use instead the `h5features.writer`
   and `h5features.reader` modules.

"""

from .data import Data
from .reader import Reader
from .writer import Writer


def read(filename, groupname=None, from_item=None, to_item=None,
         from_time=None, to_time=None, index=None):
    """Reads in a h5features file.

    This function is a thin wrapper around the Reader class.

    :param str filename: Path to a hdf5 file potentially serving as a
        container for many small files

    :param str groupname: HDF5 group to read the data from. If None,
        guess there is one and only one group in `filename`.

    :param str from_item: Optional. Read the data starting from this
        item. (defaults to the first stored item)

    :param str to_item: Optional. Read the data until reaching the
        item. (defaults to from_item if it was specified and to the
        last stored item otherwise)

    :param float from_time: Optional. (defaults to the beginning time
        in from_item) the specified times are included in the output

    :param float to_time: Optional. (defaults to the ending time in
        to_item) the specified times are included in the output

    :param int index: Optional. Legacy parameter, reading from an
        index is not implemented. Must be left to None.

    :raise NotImplementedError: if `index` is not None.

    :return: A tuple (times, features) such as:

        * times: A dictionary of 1D arrays values (keys are items).

        * features: A dictionary of 2D arrays values (keys are
          items) with the 'feature' dimension along the columns and the
          'time' dimension along the lines.

    .. note:: Note that all the files that are present on disk between
        from_item and to_item will be loaded and returned. It's the
        responsibility of the user to make sure that it will fit into
        RAM memory.

    """
    # TODO legacy read from index not implemented. Fail before the
    # file is even opened so the error does not depend on its content.
    if index is not None:
        raise NotImplementedError(
            'reading from an index is not implemented')

    data = Reader(filename, groupname).read(
        from_item, to_item, from_time, to_time)
    return data.dict_labels(), data.dict_features()


def write(filename, groupname, items, times, features,
          dformat='dense', chunk_size=0.1, sparsity=0.1, mode='a'):
    """Write h5features data in a HDF5 file.

    This function is a wrapper to the Writer class. It has three purposes:

    * Check parameters for errors (see details below),
    * Create a Data object from the items, times and features,
    * Send it to the Writer.

    :param str filename: HDF5 file to be written, potentially serving
        as a container for many small files. If the file does not
        exist, it is created. If the file is already a valid HDF5
        file, try to append the data in it.

    :param str groupname: Name of the group to write the data in, or
        to append the data to if the group already exists in the file.

    :param items: List of files from which the features were
        extracted. Items must not contain duplicates.
    :type items: list of str

    :param times: Time value for the features array. Elements of
        a 1D array are considered as the center of the time window
        associated with the features. A 2D array must have 2 columns
        corresponding to the begin and end timestamps of the features
        time window.
    :type times: list of  1D or 2D numpy arrays

    :param features: Features should have
        time along the lines and features along the columns
        (accommodating row-major storage in hdf5 files).
    :type features: list of 2D numpy arrays

    :param str dformat: Optional. Which format to store the features
        into (sparse or dense). Default is dense.

    :param float chunk_size: Optional. In MB, tuning parameter
        corresponding to the size of a chunk in the h5file. Ignored if
        the file already exists.

    :param float sparsity: Optional. Tuning parameter corresponding to
        the expected proportion (in [0, 1]) of non-zeros elements on
        average in a single frame. Only used when dformat is 'sparse'.

    :param char mode: Optional. The mode for overwriting an existing
        file, 'a' to append data to the file, 'w' to overwrite it
        (an existing `filename` is removed before writing).

    :raise IOError: if the filename is not valid or parameters are
        inconsistent.

    :raise NotImplementedError: if dformat == 'sparse'

    """
    # Local import: keeps this block self-contained without touching
    # the module-level imports.
    import os

    # Prepare the data, raise on error. The sparsity is meaningful
    # for the sparse format only.
    sparsity = sparsity if dformat == 'sparse' else None
    data = Data(items, times, features, sparsity=sparsity, check=True)

    # Honor mode='w': discard the existing file so the data is written
    # from scratch. This is done only once the data has been
    # validated, so invalid input never destroys an existing file.
    if mode == 'w' and os.path.isfile(filename):
        os.remove(filename)

    # Write all that stuff in the HDF5 file's specified group
    Writer(filename, chunk_size=chunk_size).write(
        data, groupname, append=True)


def simple_write(filename, group, times, features, item='item', mode='a'):
    """Simplified version of `write()` when there is only one item.

    The item name, its times and its features are each wrapped in a
    one-element list and forwarded to `write()`.

    :param str filename: HDF5 file to write, forwarded to `write()`.

    :param str group: Name of the group to write the data in,
        forwarded to `write()` as its `groupname` argument.

    :param times: Times of the single item.

    :param features: Features of the single item.

    :param str item: Optional. Name of the single item, default is
        'item'.

    :param char mode: Optional. Forwarded unchanged to `write()`.

    """
    write(filename, group, [item], [times], [features], mode=mode)