Source code for h5features.data

# Copyright 2014-2016 Thomas Schatz, Mathieu Bernard, Roland Thiolliere
#
# This file is part of h5features.
#
# h5features is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# h5features is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with h5features.  If not, see <http://www.gnu.org/licenses/>.

"""Provides the Data class to the h5features package."""

from .items import Items
from .labels import Labels
from .features import Features, SparseFeatures
from .index import create_index, write_index


class Data(object):
    """This class manages h5features data.

    A Data instance groups three entries stored under the keys
    'items', 'labels' and 'features'. Each entry is wrapped in its
    dedicated class (``Items``, ``Labels`` and ``Features`` or
    ``SparseFeatures``).

    :param items: The items to store, forwarded to ``Items``.
    :param labels: The labels to store, forwarded to ``Labels``.
    :param features: The features to store, forwarded to ``Features``
        or, when `sparsity` is truthy, to ``SparseFeatures``.
    :param sparsity: When truthy, the features are wrapped in
        ``SparseFeatures`` and this value is forwarded to it.
        Default is None.
    :param bool check: When True, ensure `items`, `labels` and
        `features` have the same length. The flag is also forwarded
        to each entry constructor. Default is True.

    :raise ValueError: If `check` is True and the three entries do
        not have the same length.

    """
    def __init__(self, items, labels, features, sparsity=None, check=True):
        if check and not (len(items) == len(labels) == len(features)):
            raise ValueError('all entries must have the same length ({} {} {})'
                             .format(len(items), len(labels), len(features)))

        self._entries = {
            'items': Items(items, check),
            'labels': Labels(labels, check),
            'features': (
                Features(features, check) if not sparsity
                else SparseFeatures(features, sparsity, check)),
        }

    def __eq__(self, other):
        # Comparing with a foreign object (None, int, ...) must not
        # raise AttributeError: let Python fall back to its default
        # handling, which evaluates such a comparison to False.
        if not isinstance(other, Data):
            return NotImplemented
        return self._entries == other._entries

    def __ne__(self, other):
        # Python 2 does not derive != from ==, so define it explicitly
        # to keep both operators consistent.
        equal = self.__eq__(other)
        return equal if equal is NotImplemented else not equal

    def _data(self, key):
        """Returns the raw data stored in the entry `key`."""
        return self._entries[key].data

    def _dict_entry(self, key):
        """Returns a dictionary mapping each item to its `key` data."""
        return dict(zip(self.items(), self._data(key)))

    def is_empty(self):
        """Returns True if no item is stored."""
        return len(self.items()) == 0

    def clear(self):
        """Erase stored data"""
        for entry in self._entries.values():
            entry.clear()

    def append(self, data):
        """Append a Data instance to self"""
        for key, entry in self._entries.items():
            entry.append(data._entries[key])

    def items(self):
        """Returns the stored items as a list of str."""
        return self._data('items')

    def labels(self):
        """Returns the stored labels as a list."""
        return self._data('labels')

    def features(self):
        """Returns the stored features as a list of numpy arrays."""
        return self._data('features')

    def dict_features(self):
        """Returns a items/features dictionary."""
        return self._dict_entry('features')

    def dict_labels(self):
        """Returns a items/labels dictionary."""
        return self._dict_entry('labels')

    def init_group(self, group, chunk_size):
        """Initializes a HDF5 group compliant with the stored data.

        This method creates the datasets 'items', 'labels',
        'features' and 'index' and leaves them empty.

        :param h5py.Group group: The group to initialize.
        :param float chunk_size: The size of a chunk in the file (in MB).

        """
        create_index(group, chunk_size)
        self._entries['items'].create_dataset(group, chunk_size)
        self._entries['features'].create_dataset(group, chunk_size)

        # chunking the labels depends on features chunks
        self._entries['labels'].create_dataset(
            group, self._entries['features'].nb_per_chunk)

    def is_appendable_to(self, group):
        """Returns True if the data can be appended in a given group."""
        # First check only the names
        if not all(key in group for key in self._entries):
            return False

        # If names are matching, check the contents
        return all(
            entry.is_appendable_to(group)
            for entry in self._entries.values())

    def write_to(self, group, append=False):
        """Write the data to the given group.

        :param h5py.Group group: The group to write the data on. It is
            assumed that the group is already existing or initialized
            to store h5features data (i.e. the method
            ``Data.init_group`` has been called).

        :param bool append: If False, any existing data in the group
            is overwritten. If True, the data is appended to the end
            of the group and we assume ``Data.is_appendable_to`` is
            True for this group.

        """
        # The index is written first, then the entries in this order.
        write_index(self, group, append)
        self._entries['items'].write_to(group)
        self._entries['features'].write_to(group, append)
        self._entries['labels'].write_to(group)