Source code for vis.analyzers.indexers.ngram

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#--------------------------------------------------------------------------------------------------
# Program Name:           vis
# Program Description:    Helps analyze music with computers.
#
# Filename:               analyzers/indexers/ngram.py
# Purpose:                k-part anything n-gram Indexer
#
# Copyright (C) 2013, 2014 Christopher Antila, Alexander Morgan
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#--------------------------------------------------------------------------------------------------
"""
.. codeauthor:: Christopher Antila <christopher@antila.ca>

Indexer to find k-part any-object n-grams.
"""

# pylint: disable=pointless-string-statement

import pandas
from vis.analyzers import indexer


class NGramIndexer(indexer.Indexer):
    """
    Indexer that finds k-part n-grams from other indices.

    The indexer requires at least one "vertical" index, and supports "horizontal" indices that seem
    to "connect" instances in the vertical indices. Although we use "vertical" and "horizontal" to
    describe these index types, because the class is an abstraction of two-part interval n-grams,
    you can supply any information as either type of index. If you want one-part melodic n-grams
    for example, you should supply the relevant interval information as the "vertical" component.

    There is no relationship between the number of index types, though there must be at least one
    "vertical" index.

    The ``'horizontal'`` and ``'vertical'`` settings determine which columns of the ``score``
    :class:`DataFrame` are included in the n-gram output. They are added to the n-gram in the order
    specified, so if the ``'vertical'`` setting is
    ``[('noterest.NoteRestIndexer', '1'), ('noterest.NoteRestIndexer', '0')]``, this will put the
    lower part (with index ``'1'``) before the higher part (with index ``'0'``). Note that both the
    indexer's name and the part-combination name must be included.

    This is an example minimum ``settings`` dictionary for making interval 3-grams::

        {'vertical': [('interval.IntervalIndexer', '0,1')],
         'horizontal': [('interval.HorizontalIntervalIndexer', '1')],
         'n': 3}

    In the output, groups of "vertical" events are normally enclosed in brackets, while groups of
    "horizontal" events are enclosed in parentheses. For cases where there is only one index in a
    particular direction, you can avoid printing the brackets or parentheses by setting the
    ``'mark singles'`` setting to ``False`` (the default is ``True``).

    If you want n-grams to terminate when finding one or several particular values, you can specify
    this with the ``'terminator'`` setting.

    To show that a horizontal event continues, we use ``'_'`` by default, but you can set this
    separately, for example to ``'P1'`` or ``'0'``, as seems appropriate. Note that the default
    :class:`WorkflowManager` overrides this setting by dynamically adjusting for interval quality,
    and also offers a ``'continuer'`` setting of its own, which is passed to this indexer.

    You can also use the :class:`NGramIndexer` to collect "stacks" of single vertical events. If
    you provide indices of intervals above a lowest part, for example, these "stacks" become the
    figured bass signature of a single moment. Set ``'n'`` to ``1`` for this feature. Horizontal
    events are obviously ignored in this case.
    """

    required_score_type = 'pandas.DataFrame'

    possible_settings = ['horizontal', 'vertical', 'n', 'mark_singles', 'terminator', 'continuer']
    """
    A list of possible settings for the :class:`NGramIndexer`.

    :keyword 'horizontal': Selectors for the parts to consider as "horizontal."
    :type 'horizontal': list of (basestring, basestring) tuples
    :keyword 'vertical': Selectors for the parts to consider as "vertical."
    :type 'vertical': list of (basestring, basestring) tuples
    :keyword 'n': The number of "vertical" events per n-gram.
    :type 'n': int
    :keyword 'mark_singles': Whether to use delimiters around a direction's events when
        there is only one event in that direction (e.g., the "horizontal" maps only the activity
        of a single voice). (You may also use ``'mark singles'``).
    :type 'mark_singles': bool
    :keyword 'terminator': Do not find an n-gram with a vertical item that contains any of these
        values.
    :type 'terminator': list of basestring
    :keyword 'continuer': When there is no "horizontal" event that corresponds to a vertical
        event, this is printed instead, to show that the previous "horizontal" event continues.
    :type 'continuer': basestring
    """

    default_settings = {'mark_singles': True, 'horizontal': [], 'terminator': [], 'continuer': '_'}

    _MISSING_SETTINGS = 'NGramIndexer requires "vertical" and "n" settings'
    _N_VALUE_TOO_LOW = 'NGramIndexer requires an "n" value of at least 1'

    def __init__(self, score, settings=None):
        """
        :param score: The :class:`DataFrame` to use for preparing n-grams. You must ensure the
            :class:`DataFrame` has the columns indicated in the ``settings``, or the :meth:`run`
            method will fail.
        :type score: :class:`pandas.DataFrame`
        :param dict settings: Required and optional settings. See descriptions in
            :const:`possible_settings`.

        :raises: :exc:`RuntimeError` if ``score`` is the wrong type.
        :raises: :exc:`RuntimeError` if ``score`` is not a list of the same types.
        :raises: :exc:`RuntimeError` if required settings are not present in ``settings``.
        :raises: :exc:`RuntimeError` if ``'n'`` is less than ``1``.
        """
        # Check all required settings are present in the "settings" argument.
        if settings is None or 'vertical' not in settings or 'n' not in settings:
            raise RuntimeError(NGramIndexer._MISSING_SETTINGS)
        if settings['n'] < 1:
            raise RuntimeError(NGramIndexer._N_VALUE_TOO_LOW)

        defaults = NGramIndexer.default_settings

        # The settings are stored *before* calling the superclass constructor, which is given
        # ``None`` for its own "settings" argument.
        self._settings = {}
        self._settings['vertical'] = settings['vertical']
        self._settings['n'] = settings['n']

        # The default lists are copied so that every instance owns its own list; otherwise a
        # caller mutating one indexer's setting would silently change the class-wide default.
        if 'horizontal' in settings:
            self._settings['horizontal'] = settings['horizontal']
        else:
            self._settings['horizontal'] = list(defaults['horizontal'])

        # Both spellings are accepted; 'mark singles' (with a space) wins if both are given.
        if 'mark singles' in settings:
            self._settings['mark_singles'] = settings['mark singles']
        elif 'mark_singles' in settings:
            self._settings['mark_singles'] = settings['mark_singles']
        else:
            self._settings['mark_singles'] = defaults['mark_singles']

        if 'terminator' in settings:
            self._settings['terminator'] = settings['terminator']
        else:
            self._settings['terminator'] = list(defaults['terminator'])

        if 'continuer' in settings:
            self._settings['continuer'] = settings['continuer']
        else:
            self._settings['continuer'] = defaults['continuer']

        super(NGramIndexer, self).__init__(score, None)

    @staticmethod
    def _format_thing(things, m_singles, markers=('[', ']'), terminator=None):
        """
        Format unicode objects by concatenating them with a space between and the appropriate
        grouping symbol, if relevant. This method is used by _format_vert() and _format_horiz().

        :param things: All the events for this moment. Must not be empty.
        :type things: list of basestring

        :param m_singles: Whether to put marker characters around single-item iterables.
        :type m_singles: boolean

        :param markers: The "marker" strings to put around the output, if desired. Default is
            ``('[', ']')``.
        :type markers: 2-tuple of unicode

        :param terminator: If one of the events is in this iterable, raise a RuntimeWarning.
            Default is ``None``, which means there are no terminators.
        :type terminator: list of unicode or None

        :returns: A unicode with a space between every event and marker characters if there is more
            than one event or m_singles is True.
        :rtype: unicode

        :raises: RuntimeWarning, if the one of the events is a "terminator."
        :raises: IndexError, if ``things`` is empty.
        """
        terminator = [] if terminator is None else terminator

        if len(things) > 1:
            pieces = []
            for obj in things:
                if obj in terminator:
                    raise RuntimeWarning('hit a terminator')
                pieces.append(unicode(obj))
            return ''.join([markers[0], ' '.join(pieces), markers[1]])

        only = things[0]
        if only in terminator:
            raise RuntimeWarning('hit a terminator')
        if m_singles:
            return ''.join([markers[0], unicode(only), markers[1]])
        # Convert here too, like every other branch, so that a non-string event (a float
        # duration, for example) comes back as text rather than as the raw object.
        return unicode(only)

    @staticmethod
    def _format_vert(verts, m_singles, terminator=None):
        """
        Format "vertical" unicode objects by concatenating them with a space between and the
        appropriate grouping symbol (square brackets), if relevant.

        :param verts: All the "vertical" events for this moment.
        :type verts: list of basestring

        :param m_singles: Whether to put marker characters around single-item iterables.
        :type m_singles: boolean

        :param terminator: If one of the events is in this iterable, raise a RuntimeWarning.
            Default is ``None``, which means there are no terminators.
        :type terminator: list of unicode or None

        :returns: A unicode with a space between every event and marker characters if there is more
            than one event or m_singles is True.
        :rtype: unicode

        :raises: RuntimeWarning, if the one of the events is a "terminator."
        """
        return NGramIndexer._format_thing(verts, m_singles, ('[', ']'), terminator)

    @staticmethod
    def _format_horiz(horizs, m_singles, terminator=None):
        """
        Format "horizontal" unicode objects by concatenating them with a space between and the
        appropriate grouping symbol (parentheses), if relevant.

        :param horizs: All the "horizontal" events for this moment.
        :type horizs: list of basestring

        :param m_singles: Whether to put marker characters around single-item iterables.
        :type m_singles: boolean

        :param terminator: If one of the events is in this iterable, raise a RuntimeWarning.
            Default is ``None``, which means there are no terminators.
        :type terminator: list of unicode or None

        :returns: A unicode with a space between every event and marker characters if there is more
            than one event or m_singles is True.
        :rtype: unicode

        :raises: RuntimeWarning, if the one of the events is a "terminator."
        """
        return NGramIndexer._format_thing(horizs, m_singles, ('(', ')'), terminator)

    def _make_column_label(self):
        """
        Make the part-combination column label for the returned DataFrame's MultiIndex. This
        involves a rather complex coordination between the "vertical," "horizontal," and
        "mark_singles" settings.

        The label is built from the second element of every selector tuple: the "vertical" names
        come first (in square brackets) and the "horizontal" names, if any, follow after a space
        (in parentheses). The delimiters are omitted for a direction with a single selector when
        "mark_singles" is False.

        Refer to the automated tests for examples of what happens.

        :returns: A one-element list holding the label.
        :rtype: list of str
        """
        mark = self._settings['mark_singles']

        vert_names = ['{}'.format(x[1]) for x in self._settings['vertical']]
        verts = ' '.join(vert_names)
        if len(vert_names) > 1 or mark:
            verts = '[{}]'.format(verts)

        # A missing or empty "horizontal" setting means the label has only the vertical part.
        if not self._settings.get('horizontal'):
            return [verts]

        horiz_names = ['{}'.format(x[1]) for x in self._settings['horizontal']]
        horizs = ' '.join(horiz_names)
        if len(horiz_names) > 1 or mark:
            horizs = '({})'.format(horizs)
        return ['{} {}'.format(verts, horizs)]

    def run(self):
        """
        Make an index of k-part n-grams of anything.

        Every n-gram starts with the "vertical" events of one row; each of the following ``n - 1``
        rows contributes its "horizontal" events (if there are any) and then its "vertical" events.
        An n-gram is skipped when one of its vertical events is a "terminator," and the search
        stops when there are not enough rows left to complete an n-gram.

        :returns: A single-column :class:`~pandas.DataFrame` with the new index.
        """
        # NOTE: in an incredible stroke of luck, the VIS 1 run() algorithm works without change
        #       for the VIS 2.0 release...
        # - So in a future 2.x-series point release, we can add "true" multidimensional functionality
        #   while retaining the existing 'horizontal' and 'vertical' method of naming the dimensions.
        #   In other words, we'll break the API at release 2.0 while retaining the algorithm, and
        #   add new features along with a new algorithm later, without breaking the API.

        post = []
        post_offsets = []

        # for the formatting methods
        m_singles = self._settings['mark_singles']
        term = self._settings['terminator']

        # Order the parts as specified. We have to track "i" and "name" separately so we have a new
        # order for the dict but can keep self._score straight. We'll use these tuples to keep
        # vertical and horizontal events separated in the DataFrame with a MultiIndex
        events = {}
        for i, name in enumerate(self._settings['vertical']):
            events[('v', i)] = self._score[name].dropna()
        for i, name in enumerate(self._settings['horizontal']):
            events[('h', i)] = self._score[name].dropna()

        # Make the MultiIndex and DataFrame with all events
        events = pandas.DataFrame(events, columns=pandas.MultiIndex.from_tuples(events.keys()))

        # Fill in all "vertical" NaN values with the previous value
        for i in events['v'].columns:
            # NB: still have to test the fix, as stated in issue 261
            events.update(events.loc[:, ('v', i)].fillna(method='ffill'))

        # Fill in all "horizontal" NaN values with the continuer
        if 'h' in events:
            for i in events['h'].columns:
                # NB: still have to test the fix, as stated in issue 261
                events.update(events.loc[:, ('h', i)].fillna(value=self._settings['continuer']))

        # The frame is not modified past this point, so the per-direction slices and the
        # "are there horizontal events?" test are computed once instead of once per row.
        has_horiz = 'h' in events
        verts = events['v']
        horizs = events['h'] if has_horiz else None
        following = self._settings['n'] - 1  # rows needed after the first one

        # Iterate the offsets
        for i in xrange(len(events)):
            try:
                # first vertical event
                loop_post = [NGramIndexer._format_vert(list(verts.iloc[i].sort_index()),
                                                       m_singles,
                                                       term)]
            except RuntimeWarning:  # we hit a terminator
                continue

            # NB: the "try" wraps the whole inner loop so that "break" and "continue" in the
            #     handlers act on the outer loop over the offsets.
            try:
                for k in xrange(i + 1, i + 1 + following):  # iterate to the end of 'n'
                    if has_horiz:
                        # terminators only apply to "vertical" events
                        loop_post.extend([' ',
                                          NGramIndexer._format_horiz(
                                              list(horizs.iloc[k].sort_index()),
                                              m_singles)])
                    loop_post.extend([' ',
                                      NGramIndexer._format_vert(
                                          list(verts.iloc[k].sort_index()),
                                          m_singles,
                                          term)])
            except (KeyError, IndexError):  # end of inputted Series
                break
            except RuntimeWarning:  # we hit a terminator
                continue

            post.append(''.join(loop_post))
            post_offsets.append(events.index[i])

        # prepare the part-combination labels
        combos = self._make_column_label()
        return self.make_return(combos, [pandas.Series(post, post_offsets)])