Normalizador de Texto Unicode
Normaliza texto Unicode con NFC, NFD, NFKC o NFKD.
NFC: descomposición canónica seguida de composición canónica.
Preguntas Frecuentes
Implementación de Código
import unicodedata
# Sample inputs.
# `text` is the letter 'e' followed by U+0301 COMBINING ACUTE ACCENT; it renders
# like a single 'é' but is stored as two code points.
# `full_width` holds the full-width Latin letters a, b, c.
text = "e\u0301"
full_width = "\uff41\uff42\uff43"

# Canonical forms of the accented sample:
#   NFC = canonical decomposition followed by canonical composition
#   NFD = canonical decomposition only
nfc = unicodedata.normalize("NFC", text)
nfd = unicodedata.normalize("NFD", text)

# Compatibility forms of the full-width sample:
#   NFKC = compatibility decomposition followed by canonical composition
#   NFKD = compatibility decomposition only
nfkc = unicodedata.normalize("NFKC", full_width)
nfkd = unicodedata.normalize("NFKD", full_width)

# Report the results; the trailing comments show the expected output.
print(f"NFC: {nfc!r} len={len(nfc)}")  # 'é' len=1 (single precomposed code point)
print(f"NFD: {nfd!r} len={len(nfd)}")  # 'é' len=2 (base letter + combining mark)
print(f"NFKC: {nfkc!r}")  # 'abc'
print(f"NFKD: {nfkd!r}")  # 'abc'
# Check if two strings are canonically equivalent
def canon_equal(a: str, b: str) -> bool:
    """Return True if *a* and *b* are canonically equivalent Unicode strings.

    Both arguments are normalized to NFC before comparison, so different
    code-point sequences for the same text (for example a precomposed
    "é" versus "e" followed by U+0301) compare equal. Only NFC is applied,
    so strings that differ merely by compatibility mappings (such as
    full-width versus ASCII letters) still compare unequal.

    Args:
        a: First string to compare.
        b: Second string to compare.

    Returns:
        True when the NFC forms of both strings are identical.
    """
    return unicodedata.normalize("NFC", a) == unicodedata.normalize("NFC", b)
print(canon_equal("é", "e\u0301")) # True
Comments & Feedback
Comments are powered by Giscus. Sign in with GitHub to leave a comment.