Text

Ekkart Kleinod  • 
  • bei allen Textoperationen Encoding explizit angeben: utf_8 (utf8, utf-8)
  • Unicode-Sandwich
    • Input in Unicode umwandeln
    • Operationen auf Unicode
    • Output aus Unicode umwandeln
# Write the sample text, naming the encoding explicitly instead of relying
# on the platform default.
with open('test.txt', mode='w', encoding='utf_8') as outfile:
    outfile.write('café')
# Read the file back using the same explicit encoding.
with open('test.txt', mode='r', encoding='utf_8') as infile:
    infile.read()

Kleinbuchstaben

str.casefold()

Unicode-Namen

unicodedata.name

import unicodedata

# Name of the raw character U+2126.
print(unicodedata.name('\u2126'))
# Normalize the *character* first and then look up its name: NFC replaces
# U+2126 with its canonical equivalent. Normalizing the name string
# ('OHM SIGN') instead would leave it unchanged.
print(unicodedata.name(unicodedata.normalize('NFC', '\u2126')))
print(unicodedata.name('😸'))
print(unicodedata.name('☲'))
print(unicodedata.name('A'))
OHM SIGN
GREEK CAPITAL LETTER OMEGA
GRINNING CAT FACE WITH SMILING EYES
TRIGRAM FOR FIRE
LATIN CAPITAL LETTER A

Sortieren

pyuca

  • benutzt nicht das aktuelle Locale
  • dadurch manchmal eine für das Locale falsche Sortierung (z. B. Ä nach AE statt wie AE – oder hinter allen anderen Buchstaben?)
import pyuca

# Collator implementing the Unicode Collation Algorithm (locale-independent).
coll = pyuca.Collator()
fruits = ['Birnen', 'Möhren', 'Äpfel']
# ``key`` is keyword-only for sorted() in Python 3; passing the key function
# positionally raises TypeError.
sorted_fruits = sorted(fruits, key=coll.sort_key)
print(sorted_fruits)
['Äpfel', 'Birnen', 'Möhren']

Sonderzeichen entfernen

Schwierig

  1. normalisieren
  2. Zeichen ersetzen
import unicodedata

txt = 'ÄÖÜäaüß und so'
# Decompose (NFD) so each accented letter becomes a base character followed
# by combining marks; with NFC the letters stay precomposed and the filter
# below would remove nothing.
norm_txt = unicodedata.normalize('NFD', txt)
# Keep only characters whose canonical combining class is 0 (non-marks).
shaved = ''.join(c for c in norm_txt if not unicodedata.combining(c))
# Recompose whatever is left into NFC form.
norm_shaved = unicodedata.normalize('NFC', shaved)

Komplexer: https://github.com/fluentpython/example-code-2e/blob/master/04-text-byte/simplify.py

import unicodedata
import string

def shave_marks(txt):
    """Strip every combining mark from *txt*.

    The text is decomposed (NFD), all characters with a non-zero combining
    class are dropped, and the remainder is recomposed (NFC).
    """
    decomposed = unicodedata.normalize('NFD', txt)
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return unicodedata.normalize('NFC', ''.join(base_chars))

def shave_marks_latin(txt):
    """Strip combining marks that follow an ASCII-letter base character.

    Marks attached to any other base character (e.g. Greek letters) are
    kept. The result is returned in NFC form.
    """
    kept = []
    after_latin = False
    for ch in unicodedata.normalize('NFD', txt):
        if unicodedata.combining(ch):
            # A mark is dropped only when its base character is a Latin letter.
            if not after_latin:
                kept.append(ch)
            continue
        # Non-combining character: it becomes the new base character.
        kept.append(ch)
        after_latin = ch in string.ascii_letters
    return unicodedata.normalize('NFC', ''.join(kept))

# One-to-one replacements: each symbol in the first string maps to the
# character at the same position in the second string.
single_map = str.maketrans(
    """‚ƒ„ˆ‹‘’“”•–—˜›""",
    """'f"^<''""---~>""",
)

# Full translation table: the one-to-many replacements first, followed by
# the single-character replacements from ``single_map``.
multi_map = {
    **str.maketrans({
        '€': 'EUR',
        '…': '...',
        'Æ': 'AE',
        'æ': 'ae',
        'Œ': 'OE',
        'œ': 'oe',
        '™': '(TM)',
        '‰': '<per mille>',
        '†': '**',
        '‡': '***',
    }),
    **single_map,
}

def dewinize(txt):
    """Return *txt* with Win1252 symbols translated via ``multi_map``.

    Each mapped symbol becomes an ASCII character or a short ASCII sequence;
    all other characters pass through unchanged.
    """
    translated = txt.translate(multi_map)
    return translated

def asciize(txt):
    """Return a simplified, more ASCII-friendly version of *txt*.

    Steps, in order: replace Win1252 symbols (``dewinize``), remove
    diacritics from Latin base characters (``shave_marks_latin``), replace
    'ß' with 'ss', and apply NFKC normalization to the result.
    """
    without_symbols = dewinize(txt)
    without_marks = shave_marks_latin(without_symbols)
    without_eszett = without_marks.replace('ß', 'ss')
    return unicodedata.normalize('NFKC', without_eszett)