# -*- coding: utf-8 -*-
"""
This module contains the various string preprocessors
"""
import re
# Public API of this module: the names exported by a star-import.
__all__ = [
    'compact_abbreviations',
    'normalize_whitespace',
    'ngram',
    'trigram',
    'split_words',
    'remove_punctuation',
    'nword',
]
# Pre-compiled patterns used by the preprocessors in this module.  All of
# them are raw strings so that backslash classes such as ``\s`` and ``\w``
# reach the regex engine untouched and do not trigger Python's
# invalid-escape-sequence warning.

# A literal dot, unless it is followed by optional ASCII letters and a
# closing brace.
# NOTE(review): the unbalanced ``}`` looks like the remains of a ``{n}``
# quantifier -- confirm against the upstream source before changing it.
_re_abbreviation = re.compile(r'\.(?![a-zA-Z]*})')
# One or more consecutive whitespace characters (used to collapse runs).
_re_duplicate_white = re.compile(r'\s+')
# Exactly one whitespace character (used to split on every separator).
_re_whitespace = re.compile(r'\s')
# Any single character that is neither a word character nor whitespace.
_re_punctuation = re.compile(r'[^\w\s]')
def normalize_whitespace(string: str) -> str:
    r"""
    Upper-case the string and collapse every run of whitespace into one space.

    Tabs, newlines and repeated spaces are all replaced by a single space,
    after which leading and trailing whitespace is stripped.  Note that the
    result is always upper-cased.

    Parameters
    ----------
    string
        text to normalize

    Returns
    -------
    str
        the upper-cased text with single spaces between its tokens

    Examples
    --------
    >>> normalize_whitespace('this is  a \t long   string')
    'THIS IS A LONG STRING'
    """
    collapsed = _re_duplicate_white.sub(' ', string.upper())
    return collapsed.strip()
def compact_abbreviations(string: str) -> str:
    """
    Upper-case the string and delete every dot matched by ``_re_abbreviation``.

    The characters on either side of a removed dot become adjacent, so a
    dotted abbreviation such as ``A.B.M`` collapses into a single token.
    Note that the result is always upper-cased.

    Parameters
    ----------
    string
        text that may contain dotted abbreviations

    Returns
    -------
    str
        the upper-cased text without the matched dots

    Examples
    --------
    >>> compact_abbreviations('an other A.B.M this')
    'AN OTHER ABM THIS'
    """
    # Substituting the empty string gives the same result as splitting on
    # the pattern and re-joining the pieces, without the intermediate list.
    return _re_abbreviation.sub('', string.upper())
def remove_punctuation(word: str) -> str:
    """
    Remove every character that is neither a word character nor whitespace.

    Word characters (letters, digits and the underscore) and all whitespace
    are kept as they are; the case of the text is not changed.

    Parameters
    ----------
    word: str
        text to clean; may contain several words

    Returns
    -------
    str
        the text without punctuation symbols

    Examples
    --------
    >>> remove_punctuation('.has -a .few!')
    'has a few'
    """
    return _re_punctuation.sub('', word)
def ngram(string: str, n: int) -> tuple:
    """
    Construct all n-grams (substrings of length ``n``) of the given string.

    If the string is shorter than ``n`` the whole string is returned as the
    only element of the tuple.

    Parameters
    ----------
    string
        text to slice into n-grams
    n : int
        length of each n-gram; must be at least 2

    Returns
    -------
    tuple of strings
        the n-grams in order of their starting position

    Raises
    ------
    ValueError
        if ``n`` is smaller than 2

    Examples
    --------
    >>> ngram('this', 2)
    ('th', 'hi', 'is')
    >>> ngram('ab', 3)
    ('ab',)
    """
    # Validate first so that an invalid ``n`` is rejected regardless of the
    # length of the string (previously an empty string bypassed the check).
    if n < 2:
        raise ValueError('n for a ngram must be 2 or larger')
    length = len(string)
    if n > length:
        return (string,)
    return tuple(string[start:start + n] for start in range(length - n + 1))
def trigram(string: str) -> tuple:
    """
    Construct all trigrams (substrings of length 3) of the given string.

    Shorthand for ``ngram(string, 3)``.

    Parameters
    ----------
    string
        text to slice into trigrams

    Returns
    -------
    tuple of strings
        the trigrams, or the whole string as a single element when it is
        shorter than three characters

    Examples
    --------
    >>> trigram('this')
    ('thi', 'his')
    """
    return ngram(string, 3)
def split_words(string: str) -> tuple:
    """
    Split the string at every single whitespace character.

    Unlike ``.split(' ')`` this splits on any whitespace character recognized
    by the regex engine (tabs, newlines, ...).  Each whitespace character is
    a separator of its own, so consecutive, leading or trailing whitespace
    produces empty strings in the result.

    Parameters
    ----------
    string
        text to split

    Returns
    -------
    tuple of strings
        the pieces between the whitespace characters, in order

    Examples
    --------
    >>> split_words('a new day')
    ('a', 'new', 'day')
    >>> len(split_words('a new day'))
    3
    """
    return tuple(_re_whitespace.split(string))
def nword(word: str, k: int) -> tuple:
    """
    Join every run of ``k`` consecutive words into one string.

    The text is split with ``split_words`` and a window of ``k`` words is
    moved over the result one word at a time; the words inside each window
    are concatenated without a separator.

    Parameters
    ----------
    word
        text containing whitespace-separated words
    k
        number of consecutive words per window.  Values larger than the
        number of words are clamped to it; values below 1 are not validated.

    Returns
    -------
    tuple of strings
        one concatenated string per window position

    Examples
    --------
    >>> nword('this that the other', 2)
    ('thisthat', 'thatthe', 'theother')
    """
    parts = split_words(word)
    count = len(parts)
    # A window wider than the input degrades to one window over all words.
    width = min(k, count)
    return tuple(
        ''.join(parts[start:start + width])
        for start in range(count - width + 1)
    )