python · ambv · Apr 7, 2026 · Apr 7, 2026 · Apr 7, 2026 · Apr 7, 2026
diff --git a/Lib/_py_grapheme.py b/Lib/_py_grapheme.py
@@ -0,0 +1,251 @@
+"""Pure Python implementation of unicodedata.iter_graphemes().
+
+Uses the extended grapheme cluster boundary rules from Unicode Standard
+Annex #29 (UAX #29, "Unicode Text Segmentation", also known as TR29).
+
+Property tables are in _py_grapheme_db.py, generated by
+Tools/unicode/makeunicodedata.py from the Unicode Character Database.
+"""
+
+import sys
+from bisect import bisect_right
+
+from _py_grapheme_db import (
+    GCB_RANGES,
+    GCB_Other, GCB_Prepend, GCB_CR, GCB_LF, GCB_Control,
+    GCB_Extend, GCB_Regional_Indicator, GCB_SpacingMark,
+    GCB_L, GCB_V, GCB_T, GCB_LV, GCB_LVT, GCB_ZWJ,
+    EXT_PICT_RANGES,
+    INCB_RANGES,
+    InCB_None, InCB_Linker, InCB_Consonant, InCB_Extend,
+)
+
+
+class Segment:
+    """One grapheme cluster, described as the slice [start, end) of a string.
+
+    The source string is kept by reference; the cluster text is only built
+    when str() is called on the segment.
+    """
+
+    # No per-instance __dict__: only these three attributes exist.
+    __slots__ = ('_string', 'start', 'end')
+
+    def __init__(self, string, start, end):
+        # string is the text being segmented; start/end index into it.
+        self._string, self.start, self.end = string, start, end
+
+    def __str__(self):
+        # The text of the cluster, i.e. string[start:end].
+        bounds = slice(self.start, self.end)
+        return self._string[bounds]
+
+    def __repr__(self):
+        # Debug form: shows the bounds only, not the text.
+        return f"<Segment {self.start}:{self.end}>"
+
+
+# Extended Pictographic FSM states (for GB11).  The state is advanced once
+# per code point by _iter_grapheme_clusters().
+(
+    _EP_INIT,     # not inside a candidate emoji ZWJ sequence
+    _EP_STARTED,  # Extended_Pictographic seen (plus any Extend after it)
+    _EP_ZWJ,      # ZWJ seen while in _EP_STARTED / _EP_MATCHED
+    _EP_MATCHED,  # Extended_Pictographic seen right after _EP_ZWJ
+) = range(4)
+
+# Indic Conjunct Break FSM states (for GB9c), advanced the same way.
+(
+    _INCB_INIT,     # not inside a candidate conjunct
+    _INCB_STARTED,  # InCB=Consonant seen (plus any InCB=Extend after it)
+    _INCB_LINKER,   # InCB=Linker seen since that consonant
+    _INCB_MATCHED,  # InCB=Consonant seen while in _INCB_LINKER
+) = range(4)
+
+# Hangul syllable constants, used by _get_gcb() for code points that the
+# range table does not cover.
+_HANGUL_S_BASE = 0xAC00   # first code point of the syllable range
+_HANGUL_S_COUNT = 11172   # number of code points in the syllable range
+_HANGUL_T_COUNT = 28      # offsets divisible by this are LV, others LVT
+
+# Precomputed start arrays for bisect lookup: the first code point of each
+# range, in table order (the tables are expected to be sorted by start).
+_GCB_STARTS = tuple(entry[0] for entry in GCB_RANGES)
+_EXT_PICT_STARTS = tuple(entry[0] for entry in EXT_PICT_RANGES)
+_INCB_STARTS = tuple(entry[0] for entry in INCB_RANGES)
+
+
+# ---------------------------------------------------------------------------
+# Property lookup functions
+# ---------------------------------------------------------------------------
+
+def _get_gcb(cp):
+    """Look up the Grapheme_Cluster_Break value of code point *cp*.
+
+    The range table is consulted first.  A code point it does not cover is
+    classified as LV or LVT when it lies in the Hangul syllable range, and
+    as GCB_Other otherwise.
+    """
+    slot = bisect_right(_GCB_STARTS, cp) - 1
+    if slot >= 0:
+        entry = GCB_RANGES[slot]
+        if cp <= entry[1]:
+            return entry[2]
+
+    # Hangul syllables are classified arithmetically: an offset that is a
+    # multiple of T_COUNT is LV, any other offset in the range is LVT.
+    offset = cp - _HANGUL_S_BASE
+    if 0 <= offset < _HANGUL_S_COUNT:
+        return GCB_LVT if offset % _HANGUL_T_COUNT else GCB_LV
+    return GCB_Other
+
+
+def _get_ext_pict(cp):
+    """Tell whether code point *cp* has the Extended_Pictographic property."""
+    slot = bisect_right(_EXT_PICT_STARTS, cp) - 1
+    if slot < 0:
+        # cp lies before the first range in the table.
+        return False
+    return cp <= EXT_PICT_RANGES[slot][1]
+
+
+def _get_incb(cp):
+    """Look up the Indic_Conjunct_Break value of code point *cp*.
+
+    Code points outside every range in the table map to InCB_None.
+    """
+    slot = bisect_right(_INCB_STARTS, cp) - 1
+    if slot < 0:
+        return InCB_None
+    entry = INCB_RANGES[slot]
+    return entry[2] if cp <= entry[1] else InCB_None
+
+
+# ---------------------------------------------------------------------------
+# Grapheme break algorithm (TR29)
+# ---------------------------------------------------------------------------
+
+def _grapheme_break(prev_gcb, curr_gcb, ep_state, ri_flag, incb_state):
+    """Decide whether a cluster boundary falls between two code points.
+
+    prev_gcb and curr_gcb are the Grapheme_Cluster_Break values on either
+    side of the candidate boundary.  ep_state, ri_flag and incb_state are
+    the tracker values after the current code point has been processed.
+
+    Returns True to break and False to keep both code points in one
+    cluster.  Rules are tested in order; the first one that applies wins.
+    """
+    controls = (GCB_CR, GCB_LF, GCB_Control)
+
+    # GB3: CR LF stays together.
+    if (prev_gcb, curr_gcb) == (GCB_CR, GCB_LF):
+        return False
+
+    # GB4 / GB5: otherwise always break after and before controls.
+    if prev_gcb in controls or curr_gcb in controls:
+        return True
+
+    # GB6: L x (L | V | LV | LVT)
+    if prev_gcb == GCB_L and curr_gcb in (GCB_L, GCB_V, GCB_LV, GCB_LVT):
+        return False
+
+    # GB7: (LV | V) x (V | T)
+    if prev_gcb in (GCB_LV, GCB_V) and curr_gcb in (GCB_V, GCB_T):
+        return False
+
+    # GB8: (LVT | T) x T
+    if prev_gcb in (GCB_LVT, GCB_T) and curr_gcb == GCB_T:
+        return False
+
+    # GB9 / GB9a: never break before Extend, ZWJ or SpacingMark.
+    if curr_gcb in (GCB_Extend, GCB_ZWJ, GCB_SpacingMark):
+        return False
+
+    # GB9b: never break after Prepend.
+    if prev_gcb == GCB_Prepend:
+        return False
+
+    # GB9c / GB11: the current code point completes an Indic conjunct or
+    # an emoji ZWJ sequence.
+    if incb_state == _INCB_MATCHED or ep_state == _EP_MATCHED:
+        return False
+
+    # GB12 / GB13: regional indicators pair up into flags.  ri_flag is
+    # True when the current indicator starts a new pair, i.e. break here.
+    if prev_gcb == GCB_Regional_Indicator == curr_gcb:
+        return ri_flag
+
+    # GB999: otherwise, break everywhere.
+    return True
+
+
+# ---------------------------------------------------------------------------
+# Public API
+# ---------------------------------------------------------------------------
+
+def iter_graphemes(string, /, start=0, end=sys.maxsize):
+    """Return an iterator over the grapheme clusters of *string*.
+
+    Clusters are delimited with the extended grapheme cluster rules from
+    TR29.  Scanning covers the positions from *start* up to *end*; negative
+    bounds count from the end of the string, and bounds that fall outside
+    the string are clamped.
+
+    Each item is a Segment with start/end attributes; str() of a Segment
+    gives the cluster text.
+
+    Raises TypeError if *string* is not a str.  The check runs when this
+    function is called, not when the iterator is first advanced.
+    """
+    if not isinstance(string, str):
+        raise TypeError(
+            "argument must be a unicode character, not "
+            f"'{type(string).__name__}'"
+        )
+
+    size = len(string)
+
+    # Normalise the bounds before handing them to the generator.
+    end = min(end, size)
+    if end < 0:
+        end = max(end + size, 0)
+    if start < 0:
+        start = max(start + size, 0)
+
+    return _iter_grapheme_clusters(string, start, end)
+
+
+def _iter_grapheme_clusters(string, start, end):
+    """Generate a Segment for each grapheme cluster between start and end.
+
+    start and end are used as given; iter_graphemes() normalises them
+    before calling.  The string is scanned once, left to right.  For each
+    code point the three context trackers (emoji ZWJ sequence, regional
+    indicator parity, Indic conjunct) are advanced, and _grapheme_break()
+    then decides whether a cluster ends just before that code point.
+    """
+    gcb = GCB_Other
+    ep_state = _EP_INIT
+    incb_state = _INCB_INIT
+    ri_flag = False
+
+    cluster_start = start
+    pos = start
+    while pos < end:
+        cp = ord(string[pos])
+        curr_gcb = _get_gcb(cp)
+
+        # Update Extended Pictographic FSM (GB11)
+        ext_pict = _get_ext_pict(cp)
+        if ext_pict:
+            ep_state = _EP_MATCHED if ep_state == _EP_ZWJ else _EP_STARTED
+        elif ep_state == _EP_STARTED or ep_state == _EP_MATCHED:
+            if curr_gcb == GCB_Extend:
+                ep_state = _EP_STARTED
+            elif curr_gcb == GCB_ZWJ:
+                ep_state = _EP_ZWJ
+            else:
+                ep_state = _EP_INIT
+        else:
+            ep_state = _EP_INIT
+
+        # Update Regional Indicator flag (GB12/GB13): it flips on every
+        # consecutive regional indicator and resets on anything else, so
+        # it is True on the 1st, 3rd, ... indicator of a run.
+        ri_flag = curr_gcb == GCB_Regional_Indicator and not ri_flag
+
+        # Update Indic Conjunct Break FSM (GB9c)
+        curr_incb = _get_incb(cp)
+        if curr_incb == InCB_Consonant:
+            incb_state = (
+                _INCB_MATCHED if incb_state == _INCB_LINKER else _INCB_STARTED
+            )
+        elif incb_state != _INCB_INIT:
+            if curr_incb == InCB_Extend:
+                # An Extend keeps the sequence alive without losing a
+                # linker that has already been seen.
+                incb_state = (
+                    _INCB_LINKER if incb_state == _INCB_LINKER else _INCB_STARTED
+                )
+            elif curr_incb == InCB_Linker:
+                incb_state = _INCB_LINKER
+            else:
+                incb_state = _INCB_INIT
+        else:
+            incb_state = _INCB_INIT
+
+        prev_gcb = gcb
+        gcb = curr_gcb
+
+        # The first code point of the scan has no predecessor, so no
+        # boundary is tested before it.
+        if pos != cluster_start and _grapheme_break(
+            prev_gcb, curr_gcb, ep_state, ri_flag, incb_state
+        ):
+            yield Segment(string, cluster_start, pos)
+            cluster_start = pos
+
+        pos += 1
+
+    # Emit the final cluster; nothing is emitted for an empty range.
+    if cluster_start < end:
+        yield Segment(string, cluster_start, end)