Source code for vis.analyzers.indexers.ngram

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#--------------------------------------------------------------------------------------------------
# Program Name:           vis
# Program Description:    Helps analyze music with computers.
#
# Filename:               controllers/indexers/ngram.py
# Purpose:                k-part anything n-gram Indexer
#
# Copyright (C) 2013, 2014 Christopher Antila, Alexander Morgan
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#--------------------------------------------------------------------------------------------------
"""
.. codeauthor:: Christopher Antila <crantila@fedoraproject.org>

Indexer to find k-part any-object n-grams.
"""

# Turn off "string statement has no effect" warning; the strings are for Sphinx!
# pylint: disable=W0105

import pandas
from vis.analyzers import indexer


[docs]class NGramIndexer(indexer.Indexer): """ Indexer that finds k-part n-grams from other indices. The indexer requires at least one "vertical" index, and supports "horizontal" indices that seem to "connect" instances in the vertical indices. Although we use "vertical" and "horizontal" to describe these index types, because the class is an abstraction of two-part interval n-grams, you can supply any information as either type of index. If you want one-part melodic n-grams for example, you should supply the relevant interval information as the "vertical" component. There is no relationship between the number of index types, though there must be at least one "vertical" index. The settings given to :meth:`__init__` specify which index values in ``score`` are horizontal or vertical intervals. They will be added in the n-gram in the order specified, so if the ``u'vertical'`` setting is ``[4, 1, 3]`` for lists of intervals, then for each\ vertical event, objects will be listed in that order. In the output, groups of "vertical" events are enclosed in brackets, while groups of "horizontal" events are enclosed in parentheses. For cases where there is only one index in a particular direction, you can avoid printing the brackets or parentheses by setting the ``u'mark singles'`` setting to False (though the default is ``True``). If you want n-grams to terminate when finding one or several particular values, you can specify this with the ``u'terminator'`` setting. To show that a horizontal event continues, we use ``u'_'`` by default, but you can set this separately, for example to ``u'P1'`` ``u'0'``, as seems appropriate. Note that the default :class:`WorkflowManager` is set to override this setting by dynamically adjusting to whether interval quality is set to True or False or if the user chooses to pass a custom string for this setting. You can also use the :class:`NGramIndexer` to collect "stacks" of single vertical events. If you provide indices of intervals above a lowest part, for example, these "stacks" become the figured bass signature of a single moment. Set :obj:`u'n'` to 1 for this feature. Horizontal events are obviously ignored. """ required_score_type = pandas.Series "The :class:`NGramIndexer` requires :class:`pandas.Series` as input." possible_settings = [u'horizontal', u'vertical', u'n', u'mark_singles', u'terminator', u'continuer'] """ A list of possible settings for the :class:`NGramIndexer`. :keyword u'horizontal': The parts to consider as "horizontal." :type u'horizontal': ``list`` of ``int`` :keyword u'vertical': The parts to consider as "vertical." :type u'vertical': ``list`` of ``int`` :keyword u'n': The number of "vertical" events per n-gram. :type u'n': ``int`` :keyword u'mark_singles': Whether to use delimiters around a direction's events when there is only one event in that direction (e.g., the "horizontal" maps only the activity of a single voice). (You may also use ``u'mark singles'``). :type u'mark_singles': ``bool`` :keyword u'terminator': Do not find an n-gram with a vertical item that contains any of these values. :type u'terminator': ``list`` of ``basestring`` :keyword u'continuer': When there is no "horizontal" event that corresponds to a vertical event, this is printed instead, to show that the previous "horizontal" event continues. :type u'continuer': ``basestring`` """ default_settings = {u'mark_singles': True, u'horizontal': [], u'terminator': [], u'continuer': u'_'} "A :obj:`dict` of default settings for the :class:`NGramIndexer`." def __init__(self, score, settings=None): """ :param score: A list of the "horizontal" and "vertical" indices to use for n-grams. You \ can put the "horizontal" and "vertical" indices anywhere in the list, so long as you \ use settings to specify the order. :type score: ``list`` of :class:`pandas.Series` :param settings: Required and optional settings. See descriptions in \ :const:`possible_settings`. :type settings: ``dict`` :raises: :exc:`RuntimeError` if ``score`` is the wrong type. :raises: :exc:`RuntimeError` if ``score`` is not a list of the same types. :raises: :exc:`RuntimeError` if required settings are not present in ``settings``. :raises: :exc:`RuntimeError` if ``u'n'`` is less than ``1``. """ # Check all required settings are present in the "settings" argument. if settings is None or u'vertical' not in settings or u'n' not in settings: msg = u'NGramIndexer requires "vertical" and "n" settings' raise RuntimeError(msg) elif settings[u'n'] < 1: msg = u'NGramIndexer requires an "n" value of at least 1' raise RuntimeError(msg) else: self._settings = {} self._settings[u'vertical'] = settings[u'vertical'] self._settings[u'n'] = settings[u'n'] self._settings[u'horizontal'] = settings[u'horizontal'] if u'horizontal' in settings \ else NGramIndexer.default_settings[u'horizontal'] if u'mark singles' in settings: self._settings[u'mark_singles'] = settings[u'mark singles'] elif u'mark_singles' in settings: self._settings[u'mark_singles'] = settings[u'mark_singles'] else: self._settings[u'mark_singles'] = NGramIndexer.default_settings[u'mark_singles'] self._settings[u'terminator'] = settings[u'terminator'] if u'terminator' in settings \ else NGramIndexer.default_settings[u'terminator'] self._settings[u'continuer'] = settings[u'continuer'] if u'continuer' in settings \ else NGramIndexer.default_settings[u'continuer'] # Change "TemplateIndexer" to the current class name. The superclass will handle the # "score" and "mpc" arguments, but you should have processed "settings" above, so it should # not be sent to the superclass constructor. super(NGramIndexer, self).__init__(score, None) # not using it self._indexer_func = None @staticmethod def _format_thing(things, m_singles, markers=(u'[', u']'), terminator=None): """ Format unicode objects by concatenating them with a space between and the appropriate grouping symbol, if relevant. This method is used by _format_vert() and _format_horiz(). :param things: All the events for this moment. :type things: iterable of basestring :param m_singles: Whether to put marker characters around single-item iterables. :type m_singles: boolean :param markers: The "marker" strings to put around the output, if desired. Defualt is []. :type markers: 2-tuple of unicode :param terminator: If one of the events is in this iterale, raise a RuntimeError. Default is [None]. :type terminator: list of unicode or None :returns: A unicode with a space between every event and marker characters if there is more than one event or m_singles is True. :rtype: unicode :raises: RuntimeWarning, if the one of the events is a "terminator." """ terminator = [] if terminator is None else terminator post = [] if len(things) > 1: post.append(markers[0]) for obj in things: if obj in terminator: raise RuntimeWarning(u'hit a terminator') else: post.append(unicode(obj)) post.append(u' ') post = post[:-1] # remove last space post.append(markers[1]) elif things[0] in terminator: raise RuntimeWarning(u'hit a terminator') elif m_singles: post.extend([markers[0], unicode(things[0]), markers[1]]) else: post.append(things[0]) return u''.join(post) @staticmethod def _format_vert(verts, m_singles, terminator=None): """ Format "vertical" unicode objects by concatenating them with a space between and the appropriate grouping symbol, if relevant. :param verts: All the "vertical" events for this moment. :type verts: iterable of basestring :param m_singles: Whether to put marker characters around single-item iterables. :type m_singles: boolean :param terminator: If one of the events is in this iterale, raise a RuntimeError. Default is [None]. :type terminator: list of unicode or None :returns: A unicode with a space between every event and marker characters if there is more than one event or m_singles is True. :rtype: unicode :raises: RuntimeWarning, if the one of the events is a "terminator." """ return NGramIndexer._format_thing(verts, m_singles, (u'[', u']'), terminator) @staticmethod def _format_horiz(horizs, m_singles, terminator=None): """ Format "horizontal" unicode objects by concatenating them with a space between and the appropriate grouping symbol, if relevant. :param verts: All the "horizontal" events for this moment. :type verts: iterable of basestring :param m_singles: Whether to put marker characters around single-item iterables. :type m_singles: boolean :param terminator: If one of the events is in this iterale, raise a RuntimeError. Default is [None]. :type terminator: list of unicode or None :returns: A unicode with a space between every event and marker characters if there is more than one event or m_singles is True. :rtype: unicode :raises: RuntimeWarning, if the one of the events is a "terminator." """ return NGramIndexer._format_thing(horizs, m_singles, (u'(', u')'), terminator)
[docs] def run(self): """ Make an index of k-part n-grams of anything. :returns: A single-item list with the new index. :rtype: ``list`` of :class:`pandas.Series` """ # TODO: pylint says there are too many branches; it's right post = [] post_offsets = [] # for the formatting methods m_singles = self._settings[u'mark_singles'] term = self._settings[u'terminator'] # Order the parts as specified. We have to track "i" and "name" separately so we have a new # order for the dict but can keep self._score straight. We'll use these tuples to keep # vertical and horizontal events separated in the DataFrame with a MultiIndex events = {} for i, name in enumerate(self._settings[u'vertical']): events[(u'v', i)] = self._score[name] for i, name in enumerate(self._settings[u'horizontal']): events[(u'h', i)] = self._score[name] # Make the MultiIndex and DataFrame with all events events = pandas.DataFrame(events, columns=pandas.MultiIndex.from_tuples(events.keys())) # Fill in all "vertical" NaN values with the previous value for i in events[u'v'].columns: # NB: still have to test the fix, as stated in issue 261 events.update(events.loc[:,(u'v', i)].fillna(method=u'ffill')) # Fill in all "horizontal" NaN values with the continuer if u'h' in events: for i in events[u'h'].columns: # NB: still have to test the fix, as stated in issue 261 events.update(events.loc[:,(u'h', i)].fillna(value=self._settings[u'continuer'])) # Iterate the offsets for i in xrange(len(events)): loop_post = None try: # first vertical event loop_post = [NGramIndexer._format_vert(list(events[u'v'].iloc[i].sort_index()), m_singles, term)] except RuntimeWarning: # we hit a terminator continue try: for j in xrange(self._settings[u'n'] - 1): # iterate to the end of 'n' k = i + j + 1 # the index we need ilp = None # it means "Inner Loop Post" if u'h' in events: # are there "horizontal" events? ilp = [u' ', NGramIndexer._format_horiz(list(events[u'h'].iloc[k].sort_index()), m_singles), u' ', NGramIndexer._format_vert(list(events[u'v'].iloc[k].sort_index()), m_singles, term)] else: ilp = [u' ', NGramIndexer._format_vert(list(events[u'v'].iloc[k].sort_index()), m_singles, term)] loop_post.extend(ilp) except (KeyError, IndexError, RuntimeWarning) as the_err: if isinstance(the_err, (IndexError, KeyError)): # end of inputted Series break else: # we hit a terminator continue post.append(u''.join(loop_post)) post_offsets.append(events.index[i]) return [pandas.Series(post, post_offsets)]