The Developer's Unicode Handbook · الفصل 8
Testing Unicode
A comprehensive guide to writing robust Unicode tests: edge case characters, boundary tests, fuzzing with Unicode data, and pytest fixtures for internationalized applications.
Most test suites are silently broken for Unicode. They test with ASCII, pass with flying colors, and then fail in production when a German user types their name or a Japanese user submits a form. Building a comprehensive Unicode test corpus isn't glamorous work, but it prevents the class of bugs that only appears after you launch in a new market. This chapter provides concrete test cases and strategies for every Unicode edge case that matters.
The Essential Unicode Test Corpus
Every test suite that processes text should include all of these strings. They are the "minimum viable Unicode corpus" — failing any of these represents a real bug that real users will hit.
# unicode_test_corpus.py
# Copy this into your test suite
#
# NOTE: the escapes below are single-backslash Python escapes ("\u00E9",
# not "\\u00E9") — they must produce the actual characters, otherwise the
# corpus tests literal backslash sequences instead of Unicode text.
UNICODE_TEST_STRINGS = {
    # ASCII baseline
    "ascii_simple": "Hello, World!",
    "ascii_special": "Hello<World>&\"'",
    # Diacritics (The Häagen-Dazs Test)
    "latin_diacritics": "Häagen-Dazs",
    "french_accents": "café au lait, résumé, naïve",
    "german_umlauts": "Üniversität Zürich",
    "nordic": "Ångström, Øresund, Åland",
    # NFC vs NFD (same visual, different code points)
    "nfc_e_acute": "caf\u00E9",   # precomposed é
    "nfd_e_acute": "cafe\u0301",  # e + combining acute
    # CJK (The さくら Test)
    "japanese_hiragana": "さくら",
    "japanese_kanji": "東京都",
    "chinese_simplified": "北京大学",
    "chinese_traditional": "臺灣",
    "korean_hangul": "서울시",
    # RTL (The مرحبا Test)
    "arabic": "\u0645\u0631\u062D\u0628\u0627",  # مرحبا
    "hebrew": "\u05E9\u05DC\u05D5\u05DD",        # שלום
    "mixed_rtl_ltr": "Hello \u0645\u0631\u062D\u0628\u0627 World",
    # Emoji
    "simple_emoji": "Hello \U0001F600",            # 😀
    "emoji_sequence": "\U0001F1FA\U0001F1F8",      # 🇺🇸 (flag via regional indicators)
    "zwj_family": "\U0001F468\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466",
    "skin_tone_modifier": "\U0001F44B\U0001F3FD",  # 👋🏽
    # Normalization edge cases (The café Test)
    "cafe_nfc": "caf\u00E9",
    "cafe_nfd": "cafe\u0301",
    # Case folding (The ΣΣΣ Test — Greek final sigma)
    "greek_sigma": "\u03A3\u03C3\u03C2",  # Σσς (uppercase, medial, final)
    "german_sharp_s": "Stra\u00DFe",      # Straße
    "turkish_dotted_i": "\u0130stanbul",  # İstanbul
    # Zero-width characters
    "zero_width_space": "Hello\u200BWorld",   # invisible space between words
    "zero_width_joiner": "\u0041\u200D\u0042",  # A + ZWJ + B
    "bom": "\uFEFFHello",  # BOM at start
    # Long strings
    "long_ascii": "a" * 10000,
    "long_unicode": "\u00E9" * 5000,      # é × 5000
    "long_emoji": "\U0001F600" * 1000,    # 😀 × 1000 (astral: 2 UTF-16 units each)
    # Empty and whitespace
    "empty": "",
    "space_only": " ",
    "unicode_space": "\u00A0\u2003\u2009",  # NBSP, EM SPACE, THIN SPACE
    "newlines": "line1\nline2\r\nline3\u2028line4",  # LS (Line Separator)
    # Special cases
    "null_byte": "hell\x00world",  # embedded NUL
    "lone_surrogate": "\uD800",    # unpaired surrogate (invalid in most contexts)
    "noncharacter": "\uFFFE",      # Unicode noncharacter
    "pua": "\uE000",               # Private Use Area character
    "math_alphanumeric": "\U0001D400",  # Mathematical Bold A (NFKC → A)
}
Writing Parametrized Unicode Tests
Use parametrize to run the same logic over all corpus strings. This ensures your functions handle every edge case:
import pytest
import unicodedata
from your_app.utils import normalize_text, truncate_text, count_characters
# Test normalization idempotency
@pytest.mark.parametrize("text", list(UNICODE_TEST_STRINGS.values()))
def test_normalize_is_idempotent(text: str) -> None:
    """Applying normalize_text a second time must be a no-op."""
    once = normalize_text(text)
    assert normalize_text(once) == once
# Test truncation doesn't break multibyte sequences
@pytest.mark.parametrize("text", list(UNICODE_TEST_STRINGS.values()))
def test_truncate_produces_valid_unicode(text: str) -> None:
    """Truncated strings should be valid Unicode (no broken surrogates)."""
    for length in [1, 5, 10, 50]:
        result = truncate_text(text, length)
        # Should not raise UnicodeEncodeError (it would if truncation left
        # an unpaired surrogate behind).
        # NOTE(review): the corpus's "lone_surrogate" entry is not UTF-8
        # encodable even before truncation — presumably truncate_text strips
        # or replaces it; confirm against its implementation.
        result.encode("utf-8")
        # Length should not exceed requested
        assert count_characters(result) <= length
# Test round-trip encoding
@pytest.mark.parametrize("text", list(UNICODE_TEST_STRINGS.values()))
@pytest.mark.parametrize("encoding", ["utf-8", "utf-16", "utf-32"])
def test_encoding_roundtrip(text: str, encoding: str) -> None:
    """Valid strings must survive an encode/decode roundtrip."""
    try:
        roundtripped = text.encode(encoding).decode(encoding)
    except (UnicodeEncodeError, UnicodeDecodeError):
        pytest.skip(f"String not encodable in {encoding} (lone surrogates, etc.)")
    else:
        # Compare in NFC so byte-level form differences don't matter
        normalized_out = unicodedata.normalize("NFC", roundtripped)
        normalized_in = unicodedata.normalize("NFC", text)
        assert normalized_out == normalized_in
The Specific Named Tests
The Häagen-Dazs Test (Diacritics)
def test_haagen_dazs_stored_and_retrieved() -> None:
    """Brand names with diacritics should survive database round-trips."""
    original = "Häagen-Dazs"
    record = save_to_db(original)
    fetched = get_from_db(record.id)
    # Not "H?agen-Dazs" (encoding loss) and not "Haagen-Dazs" (stripped accent)
    assert fetched.name == original
def test_haagen_dazs_searchable() -> None:
    """An accent-free query 'haagen' should still match 'Häagen-Dazs'."""
    create_brand("Häagen-Dazs")
    matches = search_brands("haagen")
    assert len(matches) == 1
    assert matches[0].name == "Häagen-Dazs"
The ZWJ Family Emoji Test
def test_family_emoji_length() -> None:
    """ZWJ family emoji should count as 1 character in user-facing length.

    The family emoji is four emoji code points joined by three U+200D
    ZERO WIDTH JOINERs, so len() reports 7 even though users see 1 glyph.
    """
    import grapheme
    family = "\U0001F468\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466"
    # Wrong metric for user-facing length: code point count is 7
    assert len(family) == 7
    # Right metric: one extended grapheme cluster
    assert grapheme.length(family) == 1
def test_family_emoji_truncation() -> None:
    """Truncating at grapheme boundaries must not split ZWJ sequences."""
    import grapheme
    family = "\U0001F468\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466"
    # 8 grapheme clusters: H, e, l, l, o, space, family, "!"
    text = f"Hello {family}!"
    # Truncate to 7 so something is actually cut (a limit of 8 would be a
    # no-op and would not exercise the boundary logic at all)
    truncated = truncate_to_graphemes(text, 7)
    # Expect "Hello " + the intact family emoji — the "!" is dropped, but
    # the ZWJ sequence must never be split (e.g. down to a lone 👨)
    assert grapheme.length(truncated) == 7
    # Truncated result must still be valid, encodable Unicode
    truncated.encode("utf-8")
The Normalization Test (café)
def test_cafe_normalization_equality() -> None:
    """NFC and NFD spellings of café must compare equal after normalization."""
    import unicodedata
    cafe_nfc = "caf\u00E9"   # precomposed é (1 code point)
    cafe_nfd = "cafe\u0301"  # 'e' + U+0301 combining acute (2 code points)
    # Raw comparison fails — this is exactly the bug being tested for
    assert cafe_nfc != cafe_nfd  # they ARE different code point sequences
    # Normalized comparison succeeds
    assert (unicodedata.normalize("NFC", cafe_nfc) ==
            unicodedata.normalize("NFC", cafe_nfd))
def test_cafe_stored_normalized() -> None:
    """Regardless of input form, the stored form should be normalized."""
    cafe_nfc = "caf\u00E9"   # precomposed
    cafe_nfd = "cafe\u0301"  # decomposed
    id1 = save_text(cafe_nfc)
    id2 = save_text(cafe_nfd)
    # Both must come back in the same normalized form
    r1 = get_text(id1)
    r2 = get_text(id2)
    assert r1 == r2  # would fail without normalization at write time
The Greek Sigma Test (ΣΣΣ)
def test_greek_final_sigma_casefold() -> None:
    """Case operations on Greek sigma: lower() vs casefold().

    Greek sigma has two lowercase forms:
      σ (U+03C3) medial sigma — used mid-word
      ς (U+03C2) final sigma  — used at word end
    casefold() maps BOTH lowercase forms (and uppercase Σ) to σ, which is
    what makes casefolded comparison treat σ and ς as equal.
    """
    word = "\u0391\u03B8\u03AE\u03BD\u03B1"  # Αθήνα (Athens)
    assert word.lower() == "αθήνα"
    # Both sigma forms fold to the same character (medial σ)
    assert "\u03C2".casefold() == "\u03C3".casefold() == "\u03C3"
    # casefold also expands ß → ss (plain lower() does not)
    assert "STRASSE".casefold() == "strasse"
    assert "Stra\u00DFe".casefold() == "strasse"
    # Critical: always use casefold, not lower, for case-insensitive comparison
    assert "Stra\u00DFe".casefold() == "STRASSE".casefold()
Property-Based Testing with Hypothesis
Hypothesis generates Unicode strings automatically, including edge cases you wouldn't think to include:
from hypothesis import given, settings
from hypothesis import strategies as st
# Test that your normalize function handles all valid Unicode
@given(st.text())
def test_normalize_never_crashes(text: str) -> None:
    """normalize_text must not raise for any Hypothesis-generated string."""
    assert isinstance(normalize_text(text), str)
# Test that encoding is always possible after normalization
@given(st.text())
def test_normalized_text_is_utf8_encodable(text: str) -> None:
    """NFC-normalized text should always be encodable as UTF-8."""
    import unicodedata
    normalized = unicodedata.normalize("NFC", text)
    try:
        normalized.encode("utf-8")
    except UnicodeEncodeError:
        # The only acceptable failure: Hypothesis generated a lone
        # surrogate (category Cs), which no UTF-8 encoder accepts.
        assert any(unicodedata.category(ch) == "Cs" for ch in normalized)
# Test that truncation never produces longer strings
@given(st.text(), st.integers(min_value=0, max_value=100))
def test_truncation_never_exceeds_max(text: str, max_len: int) -> None:
    """truncate_to_graphemes must respect the grapheme budget."""
    import grapheme
    shortened = truncate_to_graphemes(text, max_len)
    assert grapheme.length(shortened) <= max_len
Fuzzing for Unicode Security Issues
For security-sensitive code, use fuzzing to find Unicode injection vectors:
# Generate adversarial Unicode inputs
import random
import unicodedata

# Characters known to break naive text handling — each is a real escape
# (single backslash) so the list contains the characters themselves,
# not literal backslash sequences.
ADVERSARIAL_CHARS = [
    "\u202E",  # RIGHT-TO-LEFT OVERRIDE (display-order spoofing)
    "\u200B",  # ZERO WIDTH SPACE
    "\uFEFF",  # BOM / ZERO WIDTH NO-BREAK SPACE
    "\u0000",  # NUL byte
    "\uFFFE",  # Unicode noncharacter
    "\uD800",  # Lone (unpaired) surrogate
    "\u2028",  # LINE SEPARATOR (treated as a newline by JavaScript)
    "\u2029",  # PARAGRAPH SEPARATOR
    "\u0340",  # Deprecated combining grave (changes under NFC)
]
def generate_adversarial_inputs(base: str, count: int = 100,
                                seed: int | None = None) -> list[str]:
    """Generate Unicode adversarial variants of a base string.

    Args:
        base: String to mutate (e.g. a username the sanitizer must handle).
        count: Number of mutated variants to produce.
        seed: Optional RNG seed. Pass one for reproducible test collection;
            the module-level default (None) preserves the old random behavior.

    Returns:
        [base] + count mutated copies + the full UNICODE_TEST_STRINGS corpus.
    """
    rng = random.Random(seed)
    inputs = [base]
    # Insert one adversarial char at a random position per variant
    for _ in range(count):
        chars = list(base)
        # randint upper bound is len(chars): appending at the end is valid
        position = rng.randint(0, len(chars))
        chars.insert(position, rng.choice(ADVERSARIAL_CHARS))
        inputs.append("".join(chars))
    # Always include the standard corpus too
    inputs.extend(UNICODE_TEST_STRINGS.values())
    return inputs
@pytest.mark.parametrize("user_input", generate_adversarial_inputs("admin"))
def test_sanitizer_handles_adversarial_input(user_input: str) -> None:
    """Sanitizer must neither crash on nor pass through adversarial Unicode."""
    # Must not raise
    result = sanitize_user_input(user_input)
    # And must strip the dangerous characters themselves — note these are
    # single-backslash escapes: the checks look for the actual characters,
    # not for a literal backslash-u sequence.
    assert "\u202E" not in result  # no RTL override
    assert "\u200B" not in result  # no zero-width space
    assert "\x00" not in result    # no null bytes
CI Integration: Making Unicode Tests Non-Optional
The best Unicode tests are the ones that run automatically in CI. Add a dedicated Unicode smoke test step:
# .github/workflows/test.yml
- name: Run Unicode test suite
  run: |
    python -m pytest tests/test_unicode.py -v --tb=short
  env:
    PYTHONIOENCODING: utf-8
    # PYTHONUTF8=1 enables Python's UTF-8 mode everywhere, including
    # Windows. Do NOT use PYTHONLEGACYWINDOWSSTDIO here: that variable
    # ENABLES legacy (non-UTF-8) console I/O when set to ANY non-empty
    # string — even "0" — which is the opposite of what we want.
    PYTHONUTF8: "1"
# tests/test_unicode.py — the file that runs in CI
import pytest
import unicodedata
class TestMinimumViableUnicodeCompliance:
    """Every production app should pass these tests."""

    def test_nfc_nfd_comparison(self) -> None:
        """Critical: NFC and NFD forms must compare equal after normalization."""
        from your_app.utils import normalize
        # Single-backslash escapes: precomposed é vs e + combining acute
        assert normalize("caf\u00E9") == normalize("cafe\u0301")

    def test_emoji_not_truncated_mid_sequence(self) -> None:
        """Critical: emoji ZWJ sequences must not be split by truncation."""
        from your_app.utils import truncate
        family = "\U0001F468\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466"
        truncated = truncate(f"x{family}", max_chars=2)
        # Encoding succeeds only if no unpaired surrogate was left behind
        assert truncated.encode("utf-8")

    def test_no_bidi_override_in_output(self) -> None:
        """Security: user-supplied bidi overrides must be stripped before output."""
        from your_app.sanitize import sanitize
        attack = "innocent\u202Etxt"  # U+202E reverses display order
        result = sanitize(attack)
        assert "\u202E" not in result

    def test_null_byte_stripped(self) -> None:
        """Security: null bytes in user input must not reach the database."""
        from your_app.sanitize import sanitize
        result = sanitize("hello\x00world")
        assert "\x00" not in result
The goal of Unicode testing is not to test Unicode itself — that's already tested by the ICU library and Python's runtime. The goal is to test your application's handling of Unicode at every point where text crosses a boundary: user input, database storage, API responses, and display. Test the seams, and you'll catch the bugs before your users do.