Source code for vis.analyzers.indexers.ngram

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# -------------------------------------------------------------------- #
# Program Name:           vis
# Program Description:    Helps analyze music with computers.
#
# Filename:               analyzers/indexers/ngram.py
# Purpose:                k-part anything n-gram Indexer
#
# Copyright (C) 2013-2016 Alexander Morgan, Christopher Antila
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public 
# License along with this program. If not, see 
# <http://www.gnu.org/licenses/>.
# -------------------------------------------------------------------- #
"""
.. codeauthor:: Alexander Morgan
.. codeauthor:: Christopher Antila <christopher@antila.ca>

Indexer to find k-part any-object n-grams. This file is a
re-implementation of the previous ngram_indexer.py file.

"""

# pylint: disable=pointless-string-statement

import pandas
from vis.analyzers import indexer


class NGramIndexer(indexer.Indexer):
    """
    Indexer that finds k-part n-grams from other indices.

    The indexer requires at least one "vertical" index, and supports
    "horizontal" indices that seem to "connect" instances in the
    vertical indices. Although we use "vertical" and "horizontal" to
    describe these index types, because the class is an abstraction of
    two-part interval n-grams, you can supply any information as either
    type of index. If you want one-part melodic n-grams for example,
    you should supply the relevant interval information as the
    "vertical" component.

    The "vertical" and "horizontal" indices can contain an arbitrary
    number of observations that can get condensed into one value or
    kept separate in different columns. There is no relationship
    between the number of index types, though there must be at least
    one "vertical" index.

    The ``'vertical'`` and ``'horizontal'`` settings determine which
    columns of the dataframes in ``score`` are included in the n-gram
    output. ``score`` is a list of two dataframes, the vertical
    observations :class:`DataFrame` and the horizontal observations
    :class:`DataFrame`.

    The format of the vertical and horizontal settings is very
    important and will decide the structure of the resulting n-gram
    results. Both the vertical and horizontal settings should be a list
    of tuples. If the optional horizontal setting is passed, its list
    should be of the same length as that of the vertical setting.
    Inside of each tuple, enter the column names of the observations
    that you want to include in each value. For example, if you want to
    make 3-grams of notes in the tenor in a four-voice choral, use the
    following settings (NB: there is no horizontal element in this
    simple query so no horizontal setting is passed. In this scenario
    you would need to pass the noterest indexer results as the only
    dataframe in the "score" list of dataframes.):

    >>> settings = {
            'n': 3,
            'vertical': [('2',)]
        }

    If you want to look at the 4-grams in the interval pairs between
    the bass and soprano of a four-voice choral and track the melodic
    motions of the bass, the ``score`` argument should be a 2-item list
    containing the IntervalIndexer results dataframe and the
    :class:`HorizontalIntervalIndexer` dataframe. Note that the
    :class:`HorizontalIntervalIndexer` results must have been
    calculated with the ``'horiz_attach_later'`` setting set to
    ``True`` (this is in order to avoid an indexing nightmare). The
    settings dictionary to pass to this indexer would be:

    >>> settings = {
            'n': 4,
            'vertical': [('0,3',)],
            'horizontal': [('3',)]
        }

    If you want to get 'figured-bass' 2-gram output from this same
    4-voice choral, use the same 2-item list for the score argument,
    and then put all of the voice pairs that sound against the bass in
    the same tuple in the vertical setting. Here's what the settings
    should be:

    >>> settings = {
            'n': 2,
            'vertical': [('0,3', '1,3', '2,3')],
            'horizontal': [('3',)]
        }

    In the example above, if you wanted stacks of vertical events
    without the horizontal connecting events, you would just omit the
    ``'horizontal'`` setting from the settings dictionary and also only
    include the vertical observations in the ``score`` list of
    dataframes.

    If instead you want to look at all the pairs of voices in the
    4-voice piece, and always track the melodic motions of the lowest
    voice in that pair, then put each pair in a different tuple, and in
    the voice to track melodically in the corresponding tuple in the
    horizontal list. Since there are 6 pairs of voices in a 4-voice
    piece, both your vertical and horizontal settings should be a list
    of six tuples. This will cause the resulting n-gram results
    dataframe to have six columns of observations. Your settings should
    look like this:

    >>> settings = {
            'n': 2,
            'vertical': [
                ('0,1',), ('0,2',), ('0,3',),
                ('1,2',), ('1,3',), ('2,3',)
            ],
            'horizontal': [
                ('1',), ('2',), ('3',),
                ('2',), ('3',), ('3',)
            ]
        }

    Since we often want to look at all the pairs of voices in a piece,
    you can set the ``'vertical'`` setting to ``'all'`` and this will
    get all the column names from the first dataframe in the score list
    of dataframes. Similarly, as we often want to always track the
    melodic motions of the lowest or highest voice in the vertical
    groups, the horizontal setting can be set to ``'highest'`` or
    ``'lowest'`` to automate this voice selection. This means that the
    preceding query can also be accomplished with the following
    settings:

    >>> settings = {
            'n': 2,
            'vertical': 'all',
            'horizontal': 'lowest'
        }

    The ``'brackets'`` setting will set off all the vertical events at
    each time point in square brackets '[]' and horizontal observations
    will appear in parentheses '()'. This is particularly useful if
    there are multiple observations in each vertical or horizontal
    slice. For example, if we wanted to redo the query above where
    n = 4, but this time tracking the melodic motions of both the upper
    and the lower voice, it would be a good idea to set 'brackets' to
    ``True`` to make the results easier to read. The settings would
    look like this:

    >>> settings = {
            'n': 4,
            'vertical': [('0,3',)],
            'horizontal': [('0', '3',)],
            'brackets': True
        }

    If you want n-grams to terminate when finding one or several
    particular values, you can specify this by passing a list of
    strings as the ``'terminator'`` setting.

    To show that a horizontal event continues, we use ``'_'`` by
    default, but you can set this separately, for example to ``'P1'``
    or ``'0'``, as seems appropriate.

    Once you've chosen the appropriate settings, to actually run the
    indexer call it like this:

    **Example:**

    >>> from vis.models.indexed_piece import Importer
    >>> ip = Importer('pathnameToScore.xml')
    >>> ngram_settings = {
            'n': 2,
            'vertical': 'all',
            'horizontal': 'lowest'
        }
    >>> vert_settings = {
            'quality': 'chromatic',
            'simple or compound': 'simple',
            'directed': True
        }
    >>> horiz_settings = {
            'quality': 'diatonic with quality',
            'simple or compound': 'simple',
            'directed': True,
            'horiz_attach_later': True
        }
    >>> vert_ints = ip.get_data('vertical_interval',
                                settings=vert_settings)
    >>> horiz_ints = ip.get_data('horizontal_interval',
                                 settings=horiz_settings)
    >>> ip.get_data('ngram', data=[vert_ints, horiz_ints],
                    settings=ngram_settings)
    """

    required_score_type = 'pandas.DataFrame'

    possible_settings = [
        'horizontal',
        'vertical',
        'n',
        'open-ended',
        'brackets',
        'terminator',
        'continuer',
        'align',
    ]
    """
    A list of possible settings for the :class:`NGramIndexer`.

    :keyword 'horizontal': Selectors for the columns to consider as
        "horizontal." May also be ``'lowest'`` or ``'highest'`` to
        derive the tracked voice from each vertical selector.
    :type 'horizontal': list of tuples of strings, default [].
    :keyword 'vertical': Selectors for the column names to consider as
        "vertical." May also be ``'all'``.
    :type 'vertical': list of tuples of strings, default 'all'.
    :keyword 'n': The number of "vertical" events per n-gram.
    :type 'n': int
    :keyword 'open-ended': Appends the next horizontal observation to
        n-grams leaving them open-ended.
    :type 'open-ended': boolean, default ``False``.
    :keyword 'brackets': Whether to use delimiters around event
        observations. Square brackets [] are used to set off vertical
        events and round brackets () are used to set off horizontal
        events. This is particularly important to leave as ``True``
        (default) for better legibility when there are multiple
        vertical or multiple horizontal observations at each slice.
    :type 'brackets': bool, default True.
    :keyword 'terminator': Do not find an n-gram with a vertical item
        that contains any of these values.
    :type 'terminator': list of str, default [].
    :keyword 'continuer': When there is no "horizontal" event that
        corresponds to a vertical event, this is printed instead, to
        show that the previous "horizontal" event continues.
    :type 'continuer': str, default '_'.
    :keyword 'align': Which end of each n-gram supplies its index
        label. With ``'left'`` an n-gram is labelled with the index of
        its first vertical event; with ``'right'`` it is labelled with
        the index of its last vertical event.
    :type 'align': str, default 'left'.
    """

    default_settings = {
        'brackets': True,
        'horizontal': [],
        'open-ended': False,
        'terminator': [],
        'vertical': 'all',
        'continuer': '_',
        'align': 'left',
    }

    # Accepted spellings of the 'align' setting. Kept in one place so
    # that validation in __init__ and the alignment logic in run()
    # cannot drift apart.
    _LEFT_ALIGN_VALUES = ('left', 'Left', 'LEFT', 'l', 'L')
    _RIGHT_ALIGN_VALUES = ('right', 'Right', 'RIGHT', 'r', 'R')

    _MISSING_SETTINGS = (
        "NGramIndexer requires 'vertical' and 'n' "
        "settings.")
    _MISSING_HORIZONTAL_SETTING = (
        "If you provide a list of two "
        "DataFrames as the score, you must also specify the columns "
        "to examine in the second DataFrame with the 'horizontal' "
        "setting.")
    _MISSING_HORIZONTAL_DATA = (
        "NGramIndexer needs a dataframe of "
        "horizontal observations if you want to include a horizontal "
        "dimension in your ngrams.")
    # The setting key is spelled 'open-ended' (with a hyphen).
    _SUPERFLUOUS_HORIZONTAL_DATA = (
        "If n is set to 1 and the "
        "'open-ended' setting is set to False, no horizontal "
        "observations will be included in ngrams so you should leave "
        "the 'horizontal' setting blank.")
    # Only 'lowest' and 'highest' are understood as automatic values
    # for 'horizontal'; 'all' is rejected by __init__, so the message
    # must not recommend it.
    _HORIZONTAL_OUT_OF_RANGE = (
        "Not all of the specified "
        "'horizontal' columns are in the DataFrame of horizontal "
        "observations. If you're doing a query on multiple pieces, "
        "it can be convenient to pass 'lowest' or 'highest' as the "
        "'horizontal' setting which dynamically selects the voice to "
        "track from each of the 'vertical' column names.")
    _VERTICAL_OUT_OF_RANGE = (
        "Not all of the specified 'vertical' "
        "columns are in the DataFrame of vertical observations. If "
        "you're doing a query on multiple pieces, it can be "
        "convenient to pass 'all' as the 'vertical' setting which "
        "dynamically selects all of the columns of the DataFrame of "
        "vertical observations.")
    _N_VALUE_TOO_LOW = (
        "NGramIndexer requires an 'n' value of at "
        "least 1.")
    _N_VALUE_TOO_HIGH = (
        "NGramIndexer is unlikely to return results "
        "when the value of n is greater than the number of passed "
        "observations in either of the passed dataframes.")
    _WRONG_ALIGN_SETTING = (
        "Incorrect 'align' setting passed. "
        "Please use 'left', 'right', 'l', or 'r'.")

    def __init__(self, score, settings=None):
        """
        :param score: A list holding the :class:`DataFrame` of vertical
            observations and, when the ``'horizontal'`` setting is
            used, a second :class:`DataFrame` of horizontal
            observations. You must ensure the dataframes have the
            columns indicated in the ``settings``.
        :type score: list of :class:`pandas.DataFrame`
        :param dict settings: Required and optional settings. See
            descriptions in :const:`possible_settings`. Both ``'n'``
            and ``'vertical'`` must be present.

        :raises: :exc:`RuntimeError` if required settings are not
            present in ``settings``.
        :raises: :exc:`RuntimeError` if ``'n'`` is less than ``1``.
        :raises: :exc:`RuntimeError` if the number of dataframes in
            ``score`` does not agree with the ``'horizontal'`` setting,
            or if a requested column is missing from its dataframe.
        :raises: :exc:`RuntimeWarning` if every dataframe in ``score``
            has too few rows for the requested ``'n'``, if horizontal
            columns are requested for closed 1-grams, or if the
            ``'align'`` setting is not recognized.
        """
        # Check all required settings are present in "settings".
        if (settings is None or 'vertical' not in settings or
                'n' not in settings):
            raise RuntimeError(NGramIndexer._MISSING_SETTINGS)
        elif settings['n'] < 1:
            raise RuntimeError(NGramIndexer._N_VALUE_TOO_LOW)
        else:
            self._settings = NGramIndexer.default_settings.copy()
            self._settings.update(settings)

        # Number of consecutive rows one n-gram spans: an open-ended
        # n-gram consumes one extra row for its trailing horizontal.
        n = self._settings['n']
        self._cut_off = n + 1 if self._settings['open-ended'] else n
        if all(self._cut_off > len(df) for df in score):
            raise RuntimeWarning(NGramIndexer._N_VALUE_TOO_HIGH)

        super(NGramIndexer, self).__init__(score, None)

        # The first level of the column MultiIndex names the indexer
        # that produced the observations.
        self._vertical_indexer_name = self._score[0].columns[0][0]

        horizontal = self._settings['horizontal']
        if horizontal:
            if len(self._score) != 2:
                raise RuntimeError(NGramIndexer._MISSING_HORIZONTAL_DATA)
            elif n == 1 and not self._settings['open-ended']:
                raise RuntimeWarning(
                    NGramIndexer._SUPERFLUOUS_HORIZONTAL_DATA)
            elif horizontal not in ('lowest', 'highest'):
                horiz_names = self._score[1].columns.levels[1]
                if not all(col_name in horiz_names
                           for tup in horizontal for col_name in tup):
                    raise RuntimeError(
                        NGramIndexer._HORIZONTAL_OUT_OF_RANGE)
            self._horizontal_indexer_name = self._score[1].columns[0][0]
        elif len(self._score) != 1:
            # There is a df of horizontal observations, but no
            # horizontal columns specified in settings.
            raise RuntimeError(NGramIndexer._MISSING_HORIZONTAL_SETTING)

        vert_names = self._score[0].columns.levels[1]
        if self._settings['vertical'] != 'all':
            if not all(col_name in vert_names
                       for tup in self._settings['vertical']
                       for col_name in tup):
                raise RuntimeError(NGramIndexer._VERTICAL_OUT_OF_RANGE)
        else:
            # 'all': one single-column tuple per vertical column.
            self._settings['vertical'] = [(x,) for x in vert_names]

        # Voice indices grow downwards, so the lowest voice of a
        # combination such as '0,3' is the largest number in it.
        if self._settings['horizontal'] in ('lowest', 'highest'):
            pick = max if self._settings['horizontal'] == 'lowest' else min
            voices = [list(map(int, x[0].split(',')))
                      for x in self._settings['vertical']]
            self._settings['horizontal'] = [(str(pick(y)),) for y in voices]

        if self._settings['align'] not in (NGramIndexer._LEFT_ALIGN_VALUES +
                                           NGramIndexer._RIGHT_ALIGN_VALUES):
            raise RuntimeWarning(NGramIndexer._WRONG_ALIGN_SETTING)

    @staticmethod
    def _join_columns(ngram_df):
        """
        Concatenate the strings of every row of ``ngram_df`` into one
        :class:`pandas.Series`.

        :raises: :exc:`AttributeError` if the first column does not
            hold strings (raised by the ``.str`` accessor).
        """
        others = [ngram_df.iloc[:, x]
                  for x in range(1, len(ngram_df.columns))]
        return ngram_df.iloc[:, 0].str.cat(others)

    def run(self):
        """
        Make an index of k-part n-grams of anything.

        :returns: A new index of the piece in the form of a
            :class:`~pandas.DataFrame` with as many columns as there
            are tuples in the 'vertical' setting of the passed
            settings.
        """
        n = self._settings['n']
        brackets = self._settings['brackets']
        horizontal = self._settings['horizontal']
        align_right = (self._settings['align'] in
                       NGramIndexer._RIGHT_ALIGN_VALUES)
        post = []
        cols = []

        # Each i in this loop will be a dataframe column of ngrams for
        # a voice combination passed by the user.
        for i, verts in enumerate(self._settings['vertical']):
            # Keys are inserted in the order the pieces must appear in
            # the finished n-gram string.
            events = {}
            col_label = []

            if brackets:
                events[('v', 'v0')] = '['
            for j, name in enumerate(verts):
                if j > 0:
                    # Add a space if it's a non-first observation.
                    events[('v', 'v' + str(j + .5))] = ' '
                events[('v', 'v' + str(j + 1))] = self._score[0].loc[
                    :, (self._vertical_indexer_name, name)].dropna()
                col_label.append(name)
            if brackets:
                events[('v', 'v' + str(len(verts) + 1))] = ']'
            # Add a space after all vertical observations.
            events[('v', 'v' + str(len(verts) + 1.5))] = ' '

            if horizontal:  # NB: the bool value of an empty list is False.
                horizs = horizontal[i]
                if brackets:
                    events[('h', 'h0')] = '('
                col_label.append(':')
                for j, name in enumerate(horizs):
                    if j > 0:
                        # Add a space if it's a non-first observation.
                        events[('h', 'h' + str(j + .5))] = ' '
                    events[('h', 'h' + str(j + 1))] = self._score[1].loc[
                        :, (self._horizontal_indexer_name, name)].dropna()
                    col_label.append(name)
                if brackets:
                    events[('h', 'h' + str(len(horizs) + 1))] = ')'
                # Add a space after all horizontal observations.
                events[('h', 'h' + str(len(horizs) + 1.5))] = ' '

            cols.append(' '.join(col_label))

            events = pandas.DataFrame.from_dict(events)
            has_horizontal = 'h' in events

            # Forward fill all the "vertical" events. ``.ffill()`` is
            # used because ``fillna(method=...)`` is deprecated in
            # recent pandas releases.
            v_filled = events.loc[:, 'v'].ffill()

            if has_horizontal:
                # Fill in all "horizontal" NaN values with the
                # continuer.
                h_filled = events.loc[:, 'h'].fillna(
                    value=self._settings['continuer'])
                ffilled_events = pandas.concat((h_filled, v_filled), axis=1)
                chunks = [v_filled]
                chunks.extend(ffilled_events.shift(-x)
                              for x in range(1, n))
            else:
                # If there were no "horizontal" events set the chunks
                # to the vertical slices.
                chunks = [v_filled.shift(-x) for x in range(n)]

            # Add a column of horizontal events if the 'open-ended'
            # setting is True. Without horizontal observations there is
            # nothing to append (``h_filled`` does not exist), so the
            # n-grams are left closed instead of raising a NameError.
            open_ended = bool(self._settings['open-ended']) and has_horizontal
            if open_ended:
                chunks.append(h_filled.shift(-n))

            # Make a dataframe in which each vertical or horizontal
            # component of the ngrams is a column.
            ngram_df = pandas.concat(chunks, axis=1)

            # Apply the right alignment if the user asked for it. With
            # fewer than n rows no complete n-gram exists (every row is
            # removed below), and there would be no label to realign
            # from, so the index is left alone.
            if n > 1 and align_right and len(ngram_df.index) >= n:
                new_index = ngram_df.index[n - 1:]
                # It doesn't really matter what we put on the end
                # because this will get cut off anyway, but the values
                # do always have to increase.
                padding = pandas.Index(
                    [new_index[-1] + x for x in range(1, n)])
                ngram_df.index = new_index.append(padding)

            # Rows at the end that were shifted past the last
            # observation: n - 1 for closed n-grams, plus one more when
            # an open-ended horizontal column was actually appended.
            trailing = n if open_ended else n - 1

            if self._settings['terminator']:
                # Get rid of the observations that contain any of the
                # terminators; dropna() also trims the trailing rows
                # that contain nans.
                ngram_df = ngram_df.replace(self._settings['terminator'],
                                            float('nan')).dropna()
            elif trailing > 0:
                # If there are no terminators then we need to trim the
                # trailing rows that contain nans.
                ngram_df = ngram_df.iloc[:-trailing, :]

            # Try to concatenate strings of each row to turn df into a
            # series. If you encounter type other than string, first
            # convert the values to strings then do the concatenation.
            try:
                res = NGramIndexer._join_columns(ngram_df)
            except AttributeError:
                # DataFrame.applymap is deprecated in favour of
                # DataFrame.map in recent pandas; use whichever exists.
                to_str = getattr(ngram_df, 'map', None) or ngram_df.applymap
                ngram_df = to_str(str)
                res = NGramIndexer._join_columns(ngram_df)

            # Get rid of the trailing space in each ngram and add this
            # combination to post.
            post.append(res.str.rstrip())

        return self.make_return(cols, post)