Source code for ham

"""
Ham is a toolkit for manipulating words, pronunciations, and phonemes.

.. testsetup::

    from ham import Pronunciation, Word
"""
from ._version import __version__
from .symbols import *

# Placeholder written over letters that Word.pop() removes; Word.__len__
# leaves these placeholders out of its count.
FILL_CHAR = '.'


class NLTKModuleNotFound(Exception):
    """Raised when the CMU dictionary is requested but nltk cannot be imported."""


class CMUDictCache(object):
    """
    Lazy holder for the CMU pronouncing dictionary provided by nltk.

    Both the "is nltk importable?" answer and the loaded dictionary are
    computed on first access and then remembered on the instance; the
    class-level ``None`` values only mark "not computed yet".
    """

    _has_nltk = None
    _cmudict = None

    @property
    def has_nltk(self):
        """True if ``import nltk`` succeeds, False otherwise (memoized)."""
        if self._has_nltk is not None:
            return self._has_nltk
        try:
            import nltk
        except ImportError:
            available = False
        else:
            available = True
        self._has_nltk = available
        return available

    @property
    def cmudict(self):
        """
        The CMU dictionary as returned by ``nltk.corpus.cmudict.dict()``.

        Raises NLTKModuleNotFound when nltk is not importable.
        """
        if not self.has_nltk:
            raise NLTKModuleNotFound
        if self._cmudict is not None:
            return self._cmudict
        # Imported lazily so this module stays importable without nltk.
        import nltk.corpus
        self._cmudict = nltk.corpus.cmudict.dict()
        return self._cmudict

# Shared module-level cache instance; Word.pronunciations() reads the
# dictionary through it.
cache = CMUDictCache()


[docs]class Word(object): """ Encapsulates a word. """ def __init__(self, word): self._letters = list(word) def __str__(self): return ''.join(self._letters) def __repr__(self): return '<Word "{0!s}">'.format(self) def __iter__(self): return iter(self._letters) def __eq__(self, other): return (type(self) is type(other) and self._letters == other._letters) def __ne__(self, other): return not self.__eq__(other) def __contains__(self, obj): return obj in str(self) def __len__(self): return len(str(self).replace(FILL_CHAR, ''))
[docs] def pop(self, seq): """ Return and remove a letter or sequence of letters. .. doctest:: >>> hello = Word('hello') >>> hello.pop('el') 'el' >>> str(hello) 'h..lo' If seq does not exist, raise a ValueError. """ if seq not in self: raise ValueError('"{0!s}" is not in word'.format(seq)) index = str(self).find(seq) s = slice(index, index + len(seq)) chunk = self._letters[s] self._letters[s] = FILL_CHAR * len(seq) return ''.join(chunk)
[docs] def vowel_groups(self): """ Generator that yields consecutive groups of vowels. .. doctest:: >>> list(Word('onomatopoeia').vowel_groups()) ['o', 'o', 'a', 'o', 'oeia'] """ vowels = 'aeiouy' word = str(self) acc = [] for letter in word: if letter in vowels: acc.append(letter) else: if acc: yield ''.join(acc) acc = [] if acc: yield ''.join(acc)
[docs] def pronunciations(self): """ Returns a list of Pronunciations for the word. Looks the word up in the CMU pronouncing dictionary (http://www.speech.cs.cmu.edu/cgi-bin/cmudict) If the word does not exist in the pronouncing dictionary, return an empty list. .. doctest:: >>> Word('hungry').pronunciations() [<Pronunciation "HH AH1 NG G R IY0">] >>> Word('chewbacca').pronunciations() [] """ try: pronunciations = cache.cmudict[str(self)] except KeyError: return [] return [Pronunciation(p) for p in pronunciations]
class Pronunciation(object):
    """
    A wrapper around a list of phonemes.
    """

    # Digits that may trail a vowel phoneme to mark its stress.
    _STRESS_MARKS = '012'

    # Compares by value via __eq__, so it is explicitly unhashable (this is
    # already the implicit Python 3 behaviour).
    __hash__ = None

    def __init__(self, phonemes):
        self._phonemes = list(phonemes)

    @classmethod
    def _matches(cls, phoneme, target):
        """
        True if ``phoneme`` is ``target``, or is ``target`` plus a stress mark.

        Whole phonemes are compared rather than substrings, so 'N' does not
        match 'NG' and 'R' does not match 'ER0'. A target that carries its
        own stress digit can only match exactly.
        """
        return (phoneme == target or
                phoneme.rstrip(cls._STRESS_MARKS) == target)

    def __contains__(self, obj):
        """
        Checks if a phoneme is in this pronunciation.

        If the phoneme is an unstressed vowel, that phoneme will be compared
        against contained vowels without regard to stress. Otherwise, if the
        phoneme is a stressed vowel sound, the stress will be taken into
        account. Anything that is not a phoneme present in the pronunciation
        (including the empty string or a non-string) is simply not contained.

        .. doctest::

            >>> 'AA' in Pronunciation(['B', 'AA1', 'R', 'N'])
            True
            >>> 'AW0' in Pronunciation(['B', 'R', 'AW1', 'N'])
            False
            >>> 'N' in Pronunciation(['S', 'IH1', 'NG'])
            False
        """
        return any(self._matches(phoneme, obj) for phoneme in self._phonemes)

    def __str__(self):
        return ' '.join(self._phonemes)

    def __repr__(self):
        return '<Pronunciation "{0}">'.format(str(self))

    def __iter__(self):
        return iter(self._phonemes)

    def __len__(self):
        return len(self._phonemes)

    def __eq__(self, other):
        return (type(self) is type(other) and
                self._phonemes == other._phonemes)

    def __ne__(self, other):
        return not self.__eq__(other)

    def index(self, value, start=0):
        """
        Return the first index of value, starting at the start index given.

        If an unstressed vowel is given, will return the first index with
        that vowel sound regardless of stress. Otherwise index will only
        find a vowel sound with identical stress.

        Raises ValueError if no phoneme at or after ``start`` matches.

        .. doctest::

            >>> Pronunciation(['B', 'AA1', 'R', 'N']).index('AA')
            1
        """
        for i, phoneme in enumerate(self._phonemes):
            if i < start:
                continue
            if self._matches(phoneme, value):
                return i
        raise ValueError("'{0!s}' is not in pronunciation".format(value))
class SoundPairing(object):
    """
    A SoundPairing is for mapping a word with a pronunciation.

    It is used as an intermediate step for breaking a word into its
    phonograms.
    """

    def __init__(self, word, pronunciation):
        # Both arguments are wrapped (copied) so the pairing owns its own
        # Word and Pronunciation objects.
        self.word = Word(word)
        self.pronunciation = Pronunciation(pronunciation)
        # One initially-empty phonogram slot per phoneme.
        self.phonograms = [''] * len(self.pronunciation)

Project Versions

This Page