The Developer's Unicode Handbook · الفصل 8
Testing Unicode
A comprehensive guide to writing robust Unicode tests: edge case characters, boundary tests, fuzzing with Unicode data, and pytest fixtures for internationalized applications.
Most test suites are silently broken for Unicode. They test with ASCII, pass with flying colors, and then fail in production when a German user types their name or a Japanese user submits a form. Building a comprehensive Unicode test corpus isn't glamorous work, but it prevents the class of bugs that only appears after you launch in a new market. This chapter provides concrete test cases and strategies for every Unicode edge case that matters.
The Essential Unicode Test Corpus
Every test suite that processes text should include all of these strings. They are the "minimum viable Unicode corpus" — failing any of these represents a real bug that real users will hit.
# unicode_test_corpus.py
# Copy this into your test suite
#
# NOTE: the escapes below are single-backslash Python escapes ("\u00E9",
# not "\\u00E9") — they must produce the actual characters, otherwise the
# corpus tests literal backslash sequences instead of Unicode text.
UNICODE_TEST_STRINGS = {
    # ASCII baseline
    "ascii_simple": "Hello, World!",
    "ascii_special": "Hello<World>&\"'",
    # Diacritics (The Häagen-Dazs Test)
    "latin_diacritics": "Häagen-Dazs",
    "french_accents": "café au lait, résumé, naïve",
    "german_umlauts": "Üniversität Zürich",
    "nordic": "Ångström, Øresund, Åland",
    # NFC vs NFD (same visual, different code points)
    "nfc_e_acute": "caf\u00E9",   # precomposed é
    "nfd_e_acute": "cafe\u0301",  # e + combining acute
    # CJK (The さくら Test)
    "japanese_hiragana": "さくら",
    "japanese_kanji": "東京都",
    "chinese_simplified": "北京大学",
    "chinese_traditional": "臺灣",
    "korean_hangul": "서울시",
    # RTL (The مرحبا Test)
    "arabic": "\u0645\u0631\u062D\u0628\u0627",  # مرحبا
    "hebrew": "\u05E9\u05DC\u05D5\u05DD",        # שלום
    "mixed_rtl_ltr": "Hello \u0645\u0631\u062D\u0628\u0627 World",
    # Emoji
    "simple_emoji": "Hello \U0001F600",            # 😀
    "emoji_sequence": "\U0001F1FA\U0001F1F8",      # 🇺🇸 (flag via regional indicators)
    "zwj_family": "\U0001F468\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466",
    "skin_tone_modifier": "\U0001F44B\U0001F3FD",  # 👋🏽
    # Normalization edge cases (The café Test)
    "cafe_nfc": "caf\u00E9",
    "cafe_nfd": "cafe\u0301",
    # Case folding (The ΣΣΣ Test — Greek final sigma)
    "greek_sigma": "\u03A3\u03C3\u03C2",  # Σσς (uppercase, medial, final)
    "german_sharp_s": "Stra\u00DFe",      # Straße
    "turkish_dotted_i": "\u0130stanbul",  # İstanbul
    # Zero-width characters
    "zero_width_space": "Hello\u200BWorld",   # invisible space between words
    "zero_width_joiner": "\u0041\u200D\u0042",  # A + ZWJ + B
    "bom": "\uFEFFHello",  # BOM at start
    # Long strings
    "long_ascii": "a" * 10000,
    "long_unicode": "\u00E9" * 5000,      # é × 5000
    "long_emoji": "\U0001F600" * 1000,    # 😀 × 1000 (astral: 2 UTF-16 units each)
    # Empty and whitespace
    "empty": "",
    "space_only": " ",
    "unicode_space": "\u00A0\u2003\u2009",  # NBSP, EM SPACE, THIN SPACE
    "newlines": "line1\nline2\r\nline3\u2028line4",  # LS (Line Separator)
    # Special cases
    "null_byte": "hell\x00world",  # embedded NUL
    "lone_surrogate": "\uD800",    # unpaired surrogate (invalid in most contexts)
    "noncharacter": "\uFFFE",      # Unicode noncharacter
    "pua": "\uE000",               # Private Use Area character
    "math_alphanumeric": "\U0001D400",  # Mathematical Bold A (NFKC → A)
}
Writing Parametrized Unicode Tests
Use parametrize to run the same logic over all corpus strings. This ensures your functions handle every edge case:
import pytest
import unicodedata
from your_app.utils import normalize_text, truncate_text, count_characters
# Test normalization idempotency
@pytest.mark.parametrize("text", list(UNICODE_TEST_STRINGS.values()))
def test_normalize_is_idempotent(text: str) -> None:
    """Applying normalize_text a second time must be a no-op."""
    once = normalize_text(text)
    assert normalize_text(once) == once
# Test truncation doesn't break multibyte sequences
@pytest.mark.parametrize("text", list(UNICODE_TEST_STRINGS.values()))
def test_truncate_produces_valid_unicode(text: str) -> None:
    """Truncated strings should be valid Unicode (no broken surrogates)."""
    for length in [1, 5, 10, 50]:
        result = truncate_text(text, length)
        # Should not raise UnicodeEncodeError (it would if truncation left
        # an unpaired surrogate behind).
        # NOTE(review): the corpus's "lone_surrogate" entry is not UTF-8
        # encodable even before truncation — presumably truncate_text strips
        # or replaces it; confirm against its implementation.
        result.encode("utf-8")
        # Length should not exceed requested
        assert count_characters(result) <= length
# Test round-trip encoding
@pytest.mark.parametrize("text", list(UNICODE_TEST_STRINGS.values()))
@pytest.mark.parametrize("encoding", ["utf-8", "utf-16", "utf-32"])
def test_encoding_roundtrip(text: str, encoding: str) -> None:
    """Valid strings must survive an encode/decode roundtrip."""
    try:
        roundtripped = text.encode(encoding).decode(encoding)
    except (UnicodeEncodeError, UnicodeDecodeError):
        pytest.skip(f"String not encodable in {encoding} (lone surrogates, etc.)")
    else:
        # Compare in NFC so byte-level form differences don't matter
        normalized_out = unicodedata.normalize("NFC", roundtripped)
        normalized_in = unicodedata.normalize("NFC", text)
        assert normalized_out == normalized_in
The Specific Named Tests
The Häagen-Dazs Test (Diacritics)
def test_haagen_dazs_stored_and_retrieved() -> None:
    """Brand names with diacritics should survive database round-trips."""
    original = "Häagen-Dazs"
    record = save_to_db(original)
    fetched = get_from_db(record.id)
    # Not "H?agen-Dazs" (encoding loss) and not "Haagen-Dazs" (stripped accent)
    assert fetched.name == original
def test_haagen_dazs_searchable() -> None:
    """An accent-free query 'haagen' should still match 'Häagen-Dazs'."""
    create_brand("Häagen-Dazs")
    matches = search_brands("haagen")
    assert len(matches) == 1
    assert matches[0].name == "Häagen-Dazs"
The ZWJ Family Emoji Test
def test_family_emoji_length() -> None:
    """ZWJ family emoji should count as 1 character in user-facing length.

    The family emoji is four emoji code points joined by three U+200D
    ZERO WIDTH JOINERs, so len() reports 7 even though users see 1 glyph.
    """
    import grapheme
    family = "\U0001F468\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466"
    # Wrong metric for user-facing length: code point count is 7
    assert len(family) == 7
    # Right metric: one extended grapheme cluster
    assert grapheme.length(family) == 1
def test_family_emoji_truncation() -> None:
    """Truncating at grapheme boundaries must not split ZWJ sequences."""
    import grapheme
    family = "\U0001F468\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466"
    # 8 grapheme clusters: H, e, l, l, o, space, family, "!"
    text = f"Hello {family}!"
    # Truncate to 7 so something is actually cut (a limit of 8 would be a
    # no-op and would not exercise the boundary logic at all)
    truncated = truncate_to_graphemes(text, 7)
    # Expect "Hello " + the intact family emoji — the "!" is dropped, but
    # the ZWJ sequence must never be split (e.g. down to a lone 👨)
    assert grapheme.length(truncated) == 7
    # Truncated result must still be valid, encodable Unicode
    truncated.encode("utf-8")
The Normalization Test (café)
def test_cafe_normalization_equality() -> None:
    """NFC and NFD spellings of café must compare equal after normalization."""
    import unicodedata
    cafe_nfc = "caf\u00E9"   # precomposed é (1 code point)
    cafe_nfd = "cafe\u0301"  # 'e' + U+0301 combining acute (2 code points)
    # Raw comparison fails — this is exactly the bug being tested for
    assert cafe_nfc != cafe_nfd  # they ARE different code point sequences
    # Normalized comparison succeeds
    assert (unicodedata.normalize("NFC", cafe_nfc) ==
            unicodedata.normalize("NFC", cafe_nfd))
def test_cafe_stored_normalized() -> None:
    """Regardless of input form, the stored form should be normalized."""
    cafe_nfc = "caf\u00E9"   # precomposed
    cafe_nfd = "cafe\u0301"  # decomposed
    id1 = save_text(cafe_nfc)
    id2 = save_text(cafe_nfd)
    # Both must come back in the same normalized form
    r1 = get_text(id1)
    r2 = get_text(id2)
    assert r1 == r2  # would fail without normalization at write time
The Greek Sigma Test (ΣΣΣ)
def test_greek_final_sigma_casefold() -> None:
    """Case operations on Greek sigma: lower() vs casefold().

    Greek sigma has two lowercase forms:
      σ (U+03C3) medial sigma — used mid-word
      ς (U+03C2) final sigma  — used at word end
    casefold() maps BOTH lowercase forms (and uppercase Σ) to σ, which is
    what makes casefolded comparison treat σ and ς as equal.
    """
    word = "\u0391\u03B8\u03AE\u03BD\u03B1"  # Αθήνα (Athens)
    assert word.lower() == "αθήνα"
    # Both sigma forms fold to the same character (medial σ)
    assert "\u03C2".casefold() == "\u03C3".casefold() == "\u03C3"
    # casefold also expands ß → ss (plain lower() does not)
    assert "STRASSE".casefold() == "strasse"
    assert "Stra\u00DFe".casefold() == "strasse"
    # Critical: always use casefold, not lower, for case-insensitive comparison
    assert "Stra\u00DFe".casefold() == "STRASSE".casefold()
Property-Based Testing with Hypothesis
Hypothesis generates Unicode strings automatically, including edge cases you wouldn't think to include:
from hypothesis import given, settings
from hypothesis import strategies as st
# Test that your normalize function handles all valid Unicode
@given(st.text())
def test_normalize_never_crashes(text: str) -> None:
    """normalize_text must not raise for any Hypothesis-generated string."""
    assert isinstance(normalize_text(text), str)
# Test that encoding is always possible after normalization
@given(st.text())
def test_normalized_text_is_utf8_encodable(text: str) -> None:
    """NFC-normalized text should always be encodable as UTF-8."""
    import unicodedata
    normalized = unicodedata.normalize("NFC", text)
    try:
        normalized.encode("utf-8")
    except UnicodeEncodeError:
        # The only acceptable failure: Hypothesis generated a lone
        # surrogate (category Cs), which no UTF-8 encoder accepts.
        assert any(unicodedata.category(ch) == "Cs" for ch in normalized)
# Test that truncation never produces longer strings
@given(st.text(), st.integers(min_value=0, max_value=100))
def test_truncation_never_exceeds_max(text: str, max_len: int) -> None:
    """truncate_to_graphemes must respect the grapheme budget."""
    import grapheme
    shortened = truncate_to_graphemes(text, max_len)
    assert grapheme.length(shortened) <= max_len
Fuzzing for Unicode Security Issues
For security-sensitive code, use fuzzing to find Unicode injection vectors:
# Generate adversarial Unicode inputs
import random
import unicodedata

# Characters known to break naive text handling — each is a real escape
# (single backslash) so the list contains the characters themselves,
# not literal backslash sequences.
ADVERSARIAL_CHARS = [
    "\u202E",  # RIGHT-TO-LEFT OVERRIDE (display-order spoofing)
    "\u200B",  # ZERO WIDTH SPACE
    "\uFEFF",  # BOM / ZERO WIDTH NO-BREAK SPACE
    "\u0000",  # NUL byte
    "\uFFFE",  # Unicode noncharacter
    "\uD800",  # Lone (unpaired) surrogate
    "\u2028",  # LINE SEPARATOR (treated as a newline by JavaScript)
    "\u2029",  # PARAGRAPH SEPARATOR
    "\u0340",  # Deprecated combining grave (changes under NFC)
]
def generate_adversarial_inputs(base: str, count: int = 100,
                                seed: int | None = None) -> list[str]:
    """Generate Unicode adversarial variants of a base string.

    Args:
        base: String to mutate (e.g. a username the sanitizer must handle).
        count: Number of mutated variants to produce.
        seed: Optional RNG seed. Pass one for reproducible test collection;
            the module-level default (None) preserves the old random behavior.

    Returns:
        [base] + count mutated copies + the full UNICODE_TEST_STRINGS corpus.
    """
    rng = random.Random(seed)
    inputs = [base]
    # Insert one adversarial char at a random position per variant
    for _ in range(count):
        chars = list(base)
        # randint upper bound is len(chars): appending at the end is valid
        position = rng.randint(0, len(chars))
        chars.insert(position, rng.choice(ADVERSARIAL_CHARS))
        inputs.append("".join(chars))
    # Always include the standard corpus too
    inputs.extend(UNICODE_TEST_STRINGS.values())
    return inputs
@pytest.mark.parametrize("user_input", generate_adversarial_inputs("admin"))
def test_sanitizer_handles_adversarial_input(user_input: str) -> None:
    """Sanitizer must neither crash on nor pass through adversarial Unicode."""
    # Must not raise
    result = sanitize_user_input(user_input)
    # And must strip the dangerous characters themselves — note these are
    # single-backslash escapes: the checks look for the actual characters,
    # not for a literal backslash-u sequence.
    assert "\u202E" not in result  # no RTL override
    assert "\u200B" not in result  # no zero-width space
    assert "\x00" not in result    # no null bytes
CI Integration: Making Unicode Tests Non-Optional
The best Unicode tests are the ones that run automatically in CI. Add a dedicated Unicode smoke test step:
# .github/workflows/test.yml
- name: Run Unicode test suite
  run: |
    python -m pytest tests/test_unicode.py -v --tb=short
  env:
    PYTHONIOENCODING: utf-8
    # PYTHONUTF8=1 enables Python's UTF-8 mode everywhere, including
    # Windows. Do NOT use PYTHONLEGACYWINDOWSSTDIO here: that variable
    # ENABLES legacy (non-UTF-8) console I/O when set to ANY non-empty
    # string — even "0" — which is the opposite of what we want.
    PYTHONUTF8: "1"
# tests/test_unicode.py — the file that runs in CI
import pytest
import unicodedata
class TestMinimumViableUnicodeCompliance:
    """Every production app should pass these tests."""

    def test_nfc_nfd_comparison(self) -> None:
        """Critical: NFC and NFD forms must compare equal after normalization."""
        from your_app.utils import normalize
        # Single-backslash escapes: precomposed é vs e + combining acute
        assert normalize("caf\u00E9") == normalize("cafe\u0301")

    def test_emoji_not_truncated_mid_sequence(self) -> None:
        """Critical: emoji ZWJ sequences must not be split by truncation."""
        from your_app.utils import truncate
        family = "\U0001F468\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466"
        truncated = truncate(f"x{family}", max_chars=2)
        # Encoding succeeds only if no unpaired surrogate was left behind
        assert truncated.encode("utf-8")

    def test_no_bidi_override_in_output(self) -> None:
        """Security: user-supplied bidi overrides must be stripped before output."""
        from your_app.sanitize import sanitize
        attack = "innocent\u202Etxt"  # U+202E reverses display order
        result = sanitize(attack)
        assert "\u202E" not in result

    def test_null_byte_stripped(self) -> None:
        """Security: null bytes in user input must not reach the database."""
        from your_app.sanitize import sanitize
        result = sanitize("hello\x00world")
        assert "\x00" not in result
The goal of Unicode testing is not to test Unicode itself — that's already tested by the ICU library and Python's runtime. The goal is to test your application's handling of Unicode at every point where text crosses a boundary: user input, database storage, API responses, and display. Test the seams, and you'll catch the bugs before your users do.