A couple of days ago, I had to work with some ugly Unicode text, something like this:
ⒶⓀⓋⓉⒺ b̼̘̬ͭ͂̈́̀l͇͉̱͚̲̗̗͞a̱̭̬͎͉̤ͨ͂̌̑̓͂͐h̬̯̻̩͗ͩͯḅ̢̬͕͈̥̅̌͆̔̉ͅḽ̘̖̼͚́͒̈́̏͌̃͟ ͎̮̫̍ͫ̽͐͋ͤ͂a̜͔̩͇̩̪͐̍̐̃ͤ͑ ̦̌ḧ̙̝͓̜͕̝̈́ͅb̛̞͔̽̃̍ͪla̘̠͖͍̣͙̝͌ͪ͒̃ͯ ͗͛̆͊.̛̭̜̞̲͓̯ͧ̅ĥ͂͑/̢̊/̠̘͖͖̖̺̯
And I needed to get normal text out of this shit, because MySQL has weird Unicode support. So I wrote these simple functions for cleaning up Unicode text (they need `import unicodedata` at the top of the module):
def strip_accents(s):
    """Strip accents (combining marks) from a string.

    Every character of ``s`` is decomposed with NFD normalization and each
    resulting code point whose general category is ``Mn`` (nonspacing mark)
    is dropped; all other code points are kept in their original order.

    The Cyrillic short-i letters and the newline character are copied to the
    output untouched. NFD would otherwise split the short i into a base
    letter plus a combining breve, and the breve would then be removed.

    :param s: unicode text to clean (iterated one character at a time).
    :return: a new unicode string without nonspacing marks.
    """
    # Characters copied verbatim. The check below compares one character of
    # the input at a time, so only single-code-point forms can match; an
    # already-decomposed short i (base letter + U+0306) still loses its breve.
    passthrough = (u'й', u'Й', u'\n')

    result = []
    for char in s:
        if char in passthrough:
            result.append(char)
            continue
        # Decompose, then keep everything that is not a nonspacing mark.
        result.extend(
            c for c in unicodedata.normalize('NFD', char)
            if unicodedata.category(c) != 'Mn'
        )
    # u'' keeps the return type unicode even for empty input on Python 2,
    # matching strip_symbols().
    return u''.join(result)
def strip_symbols(s):
    """Strip ugly unicode symbols from a string.

    Every character of ``s`` is normalized with NFKC (so compatibility
    forms such as circled letters become their plain equivalents) and each
    resulting code point is then filtered by its Unicode general category:

    * ``Zs`` (space separators) are replaced by a plain ASCII space;
    * ``So``, ``Mn``, ``Lo``, ``Cn``, ``Co``, ``Cf`` and ``Cc`` are dropped;
    * everything else is kept.

    The Cyrillic short-i letters and the newline character are copied to the
    output untouched, before any normalization or filtering; without this
    the newline would be removed as a ``Cc`` control character.

    NOTE(review): ``Lo`` ("other letter") covers letters of caseless
    scripts, e.g. CJK ideographs, so such text is removed entirely. This
    looks deliberate for the author's use case; confirm before reusing.

    :param s: unicode text to clean (iterated one character at a time).
    :return: a new unicode string with the unwanted code points removed.
    """
    # Characters copied verbatim, bypassing normalization and filtering.
    passthrough = (u'й', u'Й', u'\n')
    # General categories that are removed from the output.
    dropped = frozenset(('So', 'Mn', 'Lo', 'Cn', 'Co', 'Cf', 'Cc'))

    result = []
    for char in s:
        if char in passthrough:
            result.append(char)
            continue
        for c in unicodedata.normalize('NFKC', char):
            # Look the category up once per code point.
            category = unicodedata.category(c)
            if category == 'Zs':
                # Collapse every kind of space separator to a plain space.
                result.append(u' ')
            elif category not in dropped:
                result.append(c)
    return u"".join(result)