# Source code for disamby.preprocessors

# -*- coding: utf-8 -*-

"""
This module contains the various string preprocessors
"""
import re

# public names exported by a star-import of this module
__all__ = [
    'compact_abbreviations',
    'normalize_whitespace',
    'ngram',
    'trigram',
    'split_words',
    'remove_punctuation',
    'nword',
]


# Patterns are written as raw strings so that `\s` and `\w` reach the regex
# engine untouched instead of being (invalid) string escape sequences.

# a literal dot, unless it is followed by optional ASCII letters and a `}`
_re_abbreviation = re.compile(r'\.(?![a-zA-Z]*})')
# one or more consecutive whitespace characters
_re_duplicate_white = re.compile(r'\s+')
# exactly one whitespace character
_re_whitespace = re.compile(r'\s')
# any single character that is neither a word character nor whitespace
_re_punctuation = re.compile(r'[^\w\s]')


def normalize_whitespace(string: str) -> str:
    """
    upper-cases the string, collapses every run of whitespace (spaces, tabs,
    newlines, ...) into a single space and trims both ends

    Parameters
    ----------
    string
        text to clean up

    Returns
    -------
    str
        the upper-cased text with single spaces as separators and without
        leading or trailing whitespace

    Examples
    --------
    >>> normalize_whitespace('this is a  \t long  string')
    'THIS IS A LONG STRING'
    """
    upper_cased = string.upper()
    collapsed = _re_duplicate_white.sub(' ', upper_cased)
    return collapsed.strip()


def compact_abbreviations(string: str) -> str:
    """
    upper-cases the string and deletes every dot matched by the module's
    abbreviation pattern, so that dotted abbreviations collapse into a
    single token

    Parameters
    ----------
    string
        text that may contain dotted abbreviations

    Returns
    -------
    str
        the upper-cased text without the matched dots

    Examples
    --------
    >>> compact_abbreviations('an other A.B.M this')
    'AN OTHER ABM THIS'
    """
    # substituting the matches with nothing is the same as splitting on them
    # and gluing the pieces back together
    return _re_abbreviation.sub('', string.upper())


def remove_punctuation(word: str) -> str:
    """
    strips every character that is neither a word character nor whitespace

    Parameters
    ----------
    word: str
        text to strip the punctuation from

    Returns
    -------
    str
        the text with only word characters and whitespace left, in their
        original order

    Examples
    --------
    >>> remove_punctuation('.has -a .few!')
    'has a few'
    """
    without_symbols = _re_punctuation.sub('', word)
    return without_symbols


def ngram(string: str, n: int) -> tuple:
    """
    constructs all contiguous ngrams of the given string. If the string is
    shorter than `n` the whole string is returned as the only element

    Parameters
    ----------
    string
        text to slice into ngrams
    n : int
        size of each ngram, must be at least 2

    Returns
    -------
    tuple of strings

    Raises
    ------
    ValueError
        if `n` is smaller than 2, whatever the length of `string`

    Examples
    --------
    >>> ngram('this', 2)
    ('th', 'hi', 'is')
    >>> ngram('a', 2)
    ('a',)
    """
    # validate before the short-string shortcut, so that an invalid n is
    # rejected for every input (including the empty string)
    if n < 2:
        raise ValueError('n for a ngram must be 2 or larger')
    length = len(string)
    if n > length:
        return (string,)
    return tuple(string[i:i + n] for i in range(length - n + 1))


def trigram(string: str) -> tuple:
    """
    constructs all ngrams of size 3 of the given string by delegating to
    `ngram`

    Parameters
    ----------
    string

    Returns
    -------
    tuple of strings

    Examples
    --------
    >>> trigram('this')
    ('thi', 'his')
    """
    size = 3
    return ngram(string, size)


def split_words(string: str) -> tuple:
    """
    breaks the string apart at every whitespace character. Unlike
    `.split(' ')` this handles any whitespace character recognized by regex,
    not only the plain space. Each single whitespace character is a
    separator, so consecutive whitespace produces empty strings

    Parameters
    ----------
    string
        text to split

    Returns
    -------
    tuple of strings

    Examples
    --------
    >>> len(split_words('a new day'))
    3
    >>> split_words('a  b')
    ('a', '', 'b')
    """
    pieces = _re_whitespace.split(string)
    return tuple(pieces)


def nword(word: str, k: int) -> tuple:
    """
    slides a window of k consecutive words over the string and joins the
    words of each window without a separator

    Parameters
    ----------
    word
        text made of whitespace separated words
    k
        number of words per window, capped at the number of words available

    Returns
    -------
    tuple of strings

    Examples
    --------
    >>> nword('this that the other', 2)
    ('thisthat', 'thatthe', 'theother')
    """
    tokens = split_words(word)
    total = len(tokens)
    # a window can never be wider than the number of words
    width = k if k < total else total
    windows = []
    for start in range(total - width + 1):
        windows.append(''.join(tokens[start:start + width]))
    return tuple(windows)