Source code for ham

"""
Ham is a toolkit for manipulating words, pronunciations, and phonemes.

.. testsetup::

    from ham import Pronunciation, Word
"""
from ._version import __version__
from .symbols import *

# Placeholder written over the letters that Word.pop() removes, so the
# remaining letters keep their positions; Word.__len__ does not count it.
FILL_CHAR = '.'


class NLTKModuleNotFound(Exception):
    """Raised when the CMU dictionary is requested but nltk is not importable."""


class CMUDictCache(object):
    """
    Lazily probes for nltk and memoizes the CMU pronouncing dictionary.

    Nothing is imported until one of the properties is first read; both
    results are remembered on the instance afterwards.
    """

    # None means "not determined yet"; replaced by a bool on first access.
    _has_nltk = None
    # None means "not loaded yet"; replaced by the loaded dictionary.
    _cmudict = None

    @property
    def has_nltk(self):
        """True if ``import nltk`` succeeds, False if it raises ImportError."""
        if self._has_nltk is not None:
            return self._has_nltk
        try:
            import nltk  # noqa: F401 -- imported only to probe availability
        except ImportError:
            available = False
        else:
            available = True
        self._has_nltk = available
        return self._has_nltk

    @property
    def cmudict(self):
        """
        The object returned by ``nltk.corpus.cmudict.dict()``.

        It is built on first access and reused afterwards.

        Raises NLTKModuleNotFound if nltk cannot be imported.
        """
        if not self.has_nltk:
            raise NLTKModuleNotFound
        if self._cmudict is not None:
            return self._cmudict
        import nltk.corpus
        self._cmudict = nltk.corpus.cmudict.dict()
        return self._cmudict

# Shared module-level instance; Word.pronunciations() reads the dictionary
# through it.
cache = CMUDictCache()


class Word(object):
    """
    Encapsulates a word as a mutable list of letters.

    Letters removed with :meth:`pop` are overwritten with ``FILL_CHAR`` so
    that the remaining letters keep their positions.
    """

    def __init__(self, word):
        self._letters = list(word)

    def __str__(self):
        return ''.join(self._letters)

    def __repr__(self):
        return '<Word "{0!s}">'.format(self)

    def __iter__(self):
        return iter(self._letters)

    def __eq__(self, other):
        return (type(self) is type(other) and
                self._letters == other._letters)

    def __ne__(self, other):
        return not self.__eq__(other)

    def __contains__(self, obj):
        # Substring test against the current text, fill characters included.
        return obj in str(self)

    def __len__(self):
        # Popped positions hold FILL_CHAR and do not count as letters.
        return len(str(self).replace(FILL_CHAR, ''))

    def pop(self, seq):
        """
        Return and remove a letter or sequence of letters.

        Only the first occurrence is removed; its positions are overwritten
        with ``FILL_CHAR``.

        .. doctest::

            >>> hello = Word('hello')
            >>> hello.pop('el')
            'el'
            >>> str(hello)
            'h..lo'

        If seq does not exist, raise a ValueError.
        """
        # A single scan both checks for membership and locates the match.
        index = str(self).find(seq)
        if index == -1:
            raise ValueError('"{0!s}" is not in word'.format(seq))
        s = slice(index, index + len(seq))
        chunk = self._letters[s]
        self._letters[s] = FILL_CHAR * len(seq)
        return ''.join(chunk)

    def vowel_groups(self):
        """
        Generator that yields consecutive groups of vowels.

        The letters ``a e i o u y`` count as vowels, in either case; the
        yielded groups keep the original casing.

        .. doctest::

            >>> list(Word('onomatopoeia').vowel_groups())
            ['o', 'o', 'a', 'o', 'oeia']
            >>> list(Word('Audio').vowel_groups())
            ['Au', 'io']
        """
        vowels = 'aeiouy'
        acc = []
        for letter in str(self):
            # Compare case-insensitively so capitalised words are handled.
            if letter.lower() in vowels:
                acc.append(letter)
            elif acc:
                yield ''.join(acc)
                acc = []
        # Flush a group that runs to the end of the word.
        if acc:
            yield ''.join(acc)

    def pronunciations(self):
        """
        Returns a list of Pronunciations for the word.

        Looks the word up in the CMU pronouncing dictionary
        (http://www.speech.cs.cmu.edu/cgi-bin/cmudict). The lookup is
        case-insensitive.

        If the word does not exist in the pronouncing dictionary,
        return an empty list.

        Raises NLTKModuleNotFound if nltk is not installed.

        .. doctest::

            >>> Word('hungry').pronunciations()
            [<Pronunciation "HH AH1 NG G R IY0">]
            >>> Word('chewbacca').pronunciations()
            []
        """
        try:
            # nltk exposes the dictionary keyed by lowercase words, so
            # normalise the case before looking the word up.
            pronunciations = cache.cmudict[str(self).lower()]
        except KeyError:
            return []
        return [Pronunciation(p) for p in pronunciations]


class Pronunciation(object):
    """
    A wrapper around a list of phonemes.

    Vowel phonemes may end in a stress digit (``0``, ``1`` or ``2``).
    Membership tests and :meth:`index` accept a phoneme either with a stress
    digit (exact match required) or without one (any stress matches).
    """

    # Digits that may terminate a phoneme to mark its stress.
    _STRESS_MARKS = '012'

    def __init__(self, phonemes):
        self._phonemes = list(phonemes)

    @staticmethod
    def _matches(query, phoneme):
        """Return True if ``phoneme`` is ``query``, ignoring the phoneme's
        stress digit when ``query`` is given without one."""
        if phoneme == query:
            return True
        # Whole phonemes are compared, never substrings, so 'N' does not
        # match 'NG' and 'R' does not match 'ER0'.
        return phoneme.rstrip(Pronunciation._STRESS_MARKS) == query

    def __contains__(self, obj):
        """
        Checks if a phoneme is in this pronunciation.

        If the phoneme is an unstressed vowel, that phoneme will be compared
        against contained vowels without regard to stress. Otherwise, if the
        phoneme is a stressed vowel sound, the stress will be taken into
        account.

        Anything that is not a phoneme of this pronunciation (including the
        empty string and non-string objects) is reported as absent.

        .. doctest::

            >>> 'AA' in Pronunciation(['B', 'AA1', 'R', 'N'])
            True

            >>> 'AW0' in Pronunciation(['B', 'R', 'AW1', 'N'])
            False

            >>> 'N' in Pronunciation(['S', 'IH1', 'NG'])
            False
        """
        return any(self._matches(obj, phoneme) for phoneme in self._phonemes)

    def __str__(self):
        return ' '.join(self._phonemes)

    def __repr__(self):
        return '<Pronunciation "{0}">'.format(str(self))

    def __iter__(self):
        return iter(self._phonemes)

    def __len__(self):
        return len(self._phonemes)

    def __eq__(self, other):
        return (type(self) is type(other) and
                self._phonemes == other._phonemes)

    def __ne__(self, other):
        return not self.__eq__(other)

    def index(self, value, start=0):
        """
        Return the first index of value, starting at the start index given.

        If an unstressed vowel is given, will return the first index with that
        vowel sound regardless of stress. Otherwise index will only find a
        vowel sound with identical stress.

        Raises ValueError if no phoneme at or after ``start`` matches.

        .. doctest::

            >>> Pronunciation(['B', 'AA1', 'R', 'N']).index('AA')
            1
            >>> Pronunciation(['S', 'IH1', 'NG', 'IH0', 'N']).index('N')
            4
        """
        for i, phoneme in enumerate(self._phonemes):
            if i < start:
                continue
            if self._matches(value, phoneme):
                return i
        raise ValueError("'{0!s}' is not in pronunciation".format(value))


class SoundPairing(object):
    """
    Maps a word onto one of its pronunciations.

    It is used as an intermediate step for breaking a word into its
    phonograms.

    Attributes set by the constructor:

    * ``word`` -- a :class:`Word` built from the ``word`` argument.
    * ``pronunciation`` -- a :class:`Pronunciation` built from the
      ``pronunciation`` argument.
    * ``phonograms`` -- a list of empty strings, one slot per phoneme.
    """

    def __init__(self, word, pronunciation):
        self.word = Word(word)
        self.pronunciation = Pronunciation(pronunciation)
        # Start with one empty phonogram slot for every phoneme.
        slot_count = len(self.pronunciation)
        self.phonograms = [''] * slot_count
Source code for ham

Project Versions

This Page

Navigation

Source code for ham

Project Versions

RTD Search

This Page

Quick search

Navigation