đŸ› ïžToolsShed

Unicode-Text-Normalisierer

Unicode-Text mit NFC, NFD, NFKC oder NFKD normalisieren.

Kanonische Zerlegung, dann kanonische Komposition

Häufig gestellte Fragen

Code-Implementierung

import unicodedata

# Sample inputs for the four Unicode normalization forms.
# 'e' followed by U+0301 COMBINING ACUTE ACCENT; renders like 'é' (U+00E9).
text = "e\u0301"
# Full-width Latin letters U+FF41..U+FF43, compatibility variants of 'abc'.
full_width = "\uff41\uff42\uff43"

# Canonical forms of the accented sample.
# NFC: canonical decomposition, then canonical composition.
nfc = unicodedata.normalize("NFC", text)
# NFD: canonical decomposition only.
nfd = unicodedata.normalize("NFD", text)

# Compatibility forms of the full-width sample.
# NFKC: compatibility decomposition, then canonical composition.
nfkc = unicodedata.normalize("NFKC", full_width)
# NFKD: compatibility decomposition only.
nfkd = unicodedata.normalize("NFKD", full_width)

# Report the results; the trailing comments show the expected output.
print(f"NFC:  {nfc!r}  len={len(nfc)}")  # 'é'  len=1 (single precomposed code point)
print(f"NFD:  {nfd!r}  len={len(nfd)}")  # 'é'  len=2 (base letter + combining mark)
print(f"NFKC: {nfkc!r}")  # 'abc'
print(f"NFKD: {nfkd!r}")  # 'abc'

def canon_equal(a: str, b: str) -> bool:
    """Return True when *a* and *b* are canonically equivalent.

    Both strings are normalized to NFC and the results are compared.
    Only canonical equivalence is considered: compatibility variants
    (for example full-width letters) are not folded, because NFC does
    not apply compatibility mappings.
    """
    left, right = (unicodedata.normalize("NFC", s) for s in (a, b))
    return left == right

# Compare precomposed 'é' (U+00E9) with 'e' + U+0301 COMBINING ACUTE ACCENT.
# The precomposed character is written as an escape so the example does not
# depend on how the source file's bytes are decoded.
print(canon_equal("\u00e9", "e\u0301"))  # True

Comments & Feedback

Comments are powered by Giscus. Sign in with GitHub to leave a comment.