Text
Ekkart Kleinod
•
Auf dieser Seite
- bei allen Textoperationen Encoding explizit angeben: `utf_8` (`utf8`, `utf-8`)
- Unicode-Sandwich
- Input in Unicode umwandeln
- Operationen auf Unicode
- Output aus Unicode umwandeln
# Write a short non-ASCII text, naming the encoding explicitly.
with open('test.txt', mode='w', encoding='utf_8') as fp:
    fp.write('café')

# Read the same file back, again naming the encoding explicitly.
with open('test.txt', encoding='utf_8') as fp:
    fp.read()
Kleinbuchstaben
str.casefold()
Unicode-Namen
unicodedata.name
import unicodedata

# Looked up unchanged, U+2126 reports its own name.
print(unicodedata.name('\u2126'))
# Normalize the character itself, not its name: the NFC form of U+2126 is
# U+03A9, so the name lookup then reports the Greek capital letter.
print(unicodedata.name(unicodedata.normalize('NFC', '\u2126')))
print(unicodedata.name('😸'))
print(unicodedata.name('☲'))
print(unicodedata.name('A'))
OHM SIGN
GREEK CAPITAL LETTER OMEGA
GRINNING CAT FACE WITH SMILING EYES
TRIGRAM FOR FIRE
LATIN CAPITAL LETTER A
Sortieren
pyuca
- benutzt nicht das aktuelle Locale
- also manchmal falsche Sortierung (Ä nach AE, statt AE, nach allem?)
import pyuca

# Collator whose sort_key method is used as the sort key below.
coll = pyuca.Collator()
fruits = ['Birnen', 'Möhren', 'Äpfel']
# sorted() accepts the key function only as a keyword argument; passing it
# positionally raises TypeError.
sorted_fruits = sorted(fruits, key=coll.sort_key)
print(sorted_fruits)
['Äpfel', 'Birnen', 'Möhren']
Sonderzeichen entfernen
Schwierig
- normalisieren
- Zeichen ersetzen
import unicodedata

txt = 'ÄÖÜäaüß und so'
# Decompose first (NFD): each accented letter becomes a base letter followed
# by separate combining marks. With NFC the letters stay precomposed and the
# filter below would remove nothing.
norm_txt = unicodedata.normalize('NFD', txt)
# Keep only characters that are not combining marks.
shaved = ''.join(c for c in norm_txt if not unicodedata.combining(c))
# Recompose whatever is left into NFC.
norm_shaved = unicodedata.normalize('NFC', shaved)
Komplexer https://github.com/fluentpython/example-code-2e/blob/master/04-text-byte/simplify.py
import unicodedata
import string


def shave_marks(txt):
    """Strip every diacritic mark from *txt*.

    The text is decomposed (NFD), all combining characters are dropped and
    the remainder is recomposed (NFC).
    """
    decomposed = unicodedata.normalize('NFD', txt)
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return unicodedata.normalize('NFC', ''.join(base_chars))


def shave_marks_latin(txt):
    """Strip diacritic marks, but only those attached to Latin base letters.

    Combining characters that follow a non-Latin base character are kept.
    """
    kept = []
    after_latin = False
    for ch in unicodedata.normalize('NFD', txt):
        if unicodedata.combining(ch):
            # A mark is dropped only when its base character is an ASCII letter.
            if not after_latin:
                kept.append(ch)
        else:
            # Not a combining character, so it starts a new base character.
            kept.append(ch)
            after_latin = ch in string.ascii_letters
    return unicodedata.normalize('NFC', ''.join(kept))


# One-to-one character replacements.
single_map = str.maketrans(
    """‚ƒ„ˆ‹‘’“”•–—˜›""",
    """'f"^<''""---~>""",
)

# One-to-many replacements, merged with the one-to-one table above.
multi_map = {
    **str.maketrans({
        '€': 'EUR',
        '…': '...',
        'Æ': 'AE',
        'æ': 'ae',
        'Œ': 'OE',
        'œ': 'oe',
        '™': '(TM)',
        '‰': '<per mille>',
        '†': '**',
        '‡': '***',
    }),
    **single_map,
}


def dewinize(txt):
    """Swap Win1252 symbols for ASCII characters or sequences."""
    translated = txt.translate(multi_map)
    return translated


def asciize(txt):
    """Apply dewinize, remove Latin diacritics, expand 'ß' and NFKC-normalize."""
    latin_plain = shave_marks_latin(dewinize(txt)).replace('ß', 'ss')
    return unicodedata.normalize('NFKC', latin_plain)