-
-
Notifications
You must be signed in to change notification settings - Fork 34.4k
gh-130273: Add pure Python implementation of unicodedata.iter_graphemes() #148218
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
df60e53
6262980
38db422
70bdb56
5701c0b
e073e06
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
| @@ -0,0 +1,251 @@ | ||||||
| """Pure Python implementation of unicodedata.iter_graphemes(). | ||||||
|
|
||||||
| Uses the extended grapheme cluster rules from Unicode TR29. | ||||||
|
|
||||||
| Property tables are in _py_grapheme_db.py, generated by | ||||||
| Tools/unicode/makeunicodedata.py from the Unicode Character Database. | ||||||
| """ | ||||||
|
|
||||||
| import sys | ||||||
| from bisect import bisect_right | ||||||
|
|
||||||
| from _py_grapheme_db import ( | ||||||
| GCB_RANGES, | ||||||
| GCB_Other, GCB_Prepend, GCB_CR, GCB_LF, GCB_Control, | ||||||
| GCB_Extend, GCB_Regional_Indicator, GCB_SpacingMark, | ||||||
| GCB_L, GCB_V, GCB_T, GCB_LV, GCB_LVT, GCB_ZWJ, | ||||||
| EXT_PICT_RANGES, | ||||||
| INCB_RANGES, | ||||||
| InCB_None, InCB_Linker, InCB_Consonant, InCB_Extend, | ||||||
| ) | ||||||
|
|
||||||
|
|
||||||
class Segment:
    """One grapheme cluster, stored as a span of its source string.

    The source string is kept by reference together with the ``start``
    and ``end`` offsets of the cluster; the cluster text itself is only
    sliced out when ``str()`` is called on the segment.
    """

    __slots__ = ('_string', 'start', 'end')

    def __init__(self, string, start, end):
        # Assigned left to right: source string first, then the bounds.
        self._string, self.start, self.end = string, start, end

    def __repr__(self):
        return f"<Segment {self.start}:{self.end}>"

    def __str__(self):
        begin, stop = self.start, self.end
        return self._string[begin:stop]
|
|
||||||
|
|
||||||
# States of the Extended Pictographic state machine (rule GB11).
_EP_INIT, _EP_STARTED, _EP_ZWJ, _EP_MATCHED = range(4)

# States of the Indic Conjunct Break state machine (rule GB9c).
_INCB_INIT, _INCB_STARTED, _INCB_LINKER, _INCB_MATCHED = range(4)

# Hangul syllable constants: first code point of the syllable block,
# number of trailing-consonant slots per LV group, and total number of
# syllables (19 leading consonants * 21 vowels * 28 trailing slots).
_HANGUL_S_BASE = 0xAC00
_HANGUL_T_COUNT = 28
_HANGUL_S_COUNT = 19 * 21 * _HANGUL_T_COUNT

# First element of every range entry, extracted once so the lookup
# functions can bisect over a flat tuple.
_GCB_STARTS = tuple(entry[0] for entry in GCB_RANGES)
_EXT_PICT_STARTS = tuple(entry[0] for entry in EXT_PICT_RANGES)
_INCB_STARTS = tuple(entry[0] for entry in INCB_RANGES)
|
|
||||||
|
|
||||||
| # --------------------------------------------------------------------------- | ||||||
| # Property lookup functions | ||||||
| # --------------------------------------------------------------------------- | ||||||
|
|
||||||
def _get_gcb(cp):
    """Look up the Grapheme_Cluster_Break value of code point *cp*.

    GCB_RANGES entries are indexed as (first, last, value) with *last*
    inclusive.  The table is searched by bisecting over the entry starts
    (assumes the table is sorted by start, as bisect requires).  Code
    points not covered by the table that fall in the Hangul syllable
    block are classified arithmetically; everything else is GCB_Other.
    """
    slot = bisect_right(_GCB_STARTS, cp) - 1
    if slot >= 0:
        entry = GCB_RANGES[slot]
        if cp <= entry[1]:
            return entry[2]

    # Hangul syllables: the first syllable of every group of T_COUNT is
    # an LV syllable, the remaining ones are LVT.
    offset = cp - _HANGUL_S_BASE
    if 0 <= offset < _HANGUL_S_COUNT:
        return GCB_LVT if offset % _HANGUL_T_COUNT else GCB_LV

    return GCB_Other
|
|
||||||
|
|
||||||
def _get_ext_pict(cp):
    """Tell whether code point *cp* is Extended_Pictographic.

    Returns True when *cp* lies inside one of the EXT_PICT_RANGES
    entries (the second element of an entry is the inclusive upper
    bound), False otherwise.
    """
    slot = bisect_right(_EXT_PICT_STARTS, cp) - 1
    if slot < 0:
        # cp sorts before the first range.
        return False
    return cp <= EXT_PICT_RANGES[slot][1]
|
|
||||||
|
|
||||||
def _get_incb(cp):
    """Look up the Indic_Conjunct_Break value of code point *cp*.

    INCB_RANGES entries are indexed as (first, last, value) with *last*
    inclusive.  Code points outside every range map to InCB_None.
    """
    slot = bisect_right(_INCB_STARTS, cp) - 1
    if slot < 0:
        # cp sorts before the first range.
        return InCB_None
    candidate = INCB_RANGES[slot]
    return candidate[2] if cp <= candidate[1] else InCB_None
|
|
||||||
|
|
||||||
| # --------------------------------------------------------------------------- | ||||||
| # Grapheme break algorithm (TR29) | ||||||
| # --------------------------------------------------------------------------- | ||||||
|
|
||||||
def _grapheme_break(prev_gcb, curr_gcb, ep_state, ri_flag, incb_state):
    """Decide whether a cluster boundary lies between two characters.

    prev_gcb, curr_gcb -- Grapheme_Cluster_Break values of the character
        before and after the candidate boundary.
    ep_state -- Extended Pictographic state after consuming the current
        character (consulted for GB11).
    ri_flag -- result to return when both characters are Regional
        Indicators (GB12/GB13).
    incb_state -- Indic Conjunct Break state after consuming the current
        character (consulted for GB9c).

    Returns True for a break and False for no break.  The rules are
    tried in TR29 order and the first one that applies wins.
    """
    # NOTE(review): a precomputed mapping (dict or 2-level array) was
    # suggested in review as a faster alternative to this rule chain.

    # GB3: CR LF stays together.
    if prev_gcb == GCB_CR and curr_gcb == GCB_LF:
        return False

    # GB4 / GB5: always break after and before CR, LF and Control.
    controls = (GCB_CR, GCB_LF, GCB_Control)
    if prev_gcb in controls:
        return True
    if curr_gcb in controls:
        return True

    # GB6: L x (L | V | LV | LVT)
    if prev_gcb == GCB_L and curr_gcb in (GCB_L, GCB_V, GCB_LV, GCB_LVT):
        return False

    # GB7: (LV | V) x (V | T)
    if prev_gcb in (GCB_LV, GCB_V) and curr_gcb in (GCB_V, GCB_T):
        return False

    # GB8: (LVT | T) x T
    if prev_gcb in (GCB_LVT, GCB_T) and curr_gcb == GCB_T:
        return False

    # GB9 / GB9a: no break before Extend, ZWJ or SpacingMark.
    if curr_gcb in (GCB_Extend, GCB_ZWJ):
        return False
    if curr_gcb == GCB_SpacingMark:
        return False

    # GB9b: no break after Prepend.
    if prev_gcb == GCB_Prepend:
        return False

    # GB9c: the current character completes an Indic conjunct.
    if incb_state == _INCB_MATCHED:
        return False

    # GB11: the current character completes an emoji ZWJ sequence.
    if ep_state == _EP_MATCHED:
        return False

    # GB12 / GB13: between two Regional Indicators the caller-maintained
    # flag carries the answer.
    both_regional = (
        prev_gcb == GCB_Regional_Indicator
        and curr_gcb == GCB_Regional_Indicator
    )
    if both_regional:
        return ri_flag

    # GB999: break everywhere else.
    return True
|
|
||||||
|
|
||||||
| # --------------------------------------------------------------------------- | ||||||
| # Public API | ||||||
| # --------------------------------------------------------------------------- | ||||||
|
|
||||||
def iter_graphemes(string, /, start=0, end=sys.maxsize):
    """Iterate over grapheme clusters in a string.

    Uses extended grapheme cluster rules from TR29.

    *start* and *end* select the part of *string* to segment.  They are
    adjusted like slice bounds: a negative value has the string length
    added to it, and values still out of range are clamped to the string.

    Returns an iterator yielding Segment objects with start/end attributes
    and str() support.

    Raises TypeError if *string* is not a str, or if *start* or *end* is
    not an integer.  Both errors are raised by this call itself rather
    than on first iteration of the returned iterator.
    """
    from operator import index

    if not isinstance(string, str):
        # The argument is a whole string, not a single character.
        raise TypeError(
            "argument must be str, not "
            f"'{type(string).__name__}'"
        )

    # Reject non-integer bounds here; otherwise a value such as 1.5 would
    # pass the clamping below and only fail later inside the generator.
    start = index(start)
    end = index(end)

    length = len(string)
    if end > length:
        end = length
    if end < 0:
        end += length
        if end < 0:
            end = 0
    if start < 0:
        start += length
        if start < 0:
            start = 0

    return _iter_grapheme_clusters(string, start, end)
|
|
||||||
|
|
||||||
def _iter_grapheme_clusters(string, start, end):
    """Generate a Segment for every grapheme cluster in string[start:end].

    The characters are scanned once, left to right.  For each character
    the three pieces of context needed by _grapheme_break() are updated
    first (Extended Pictographic state, Regional Indicator flag, Indic
    Conjunct Break state) and the boundary in front of the character is
    decided afterwards.  *start* and *end* are used as given; the caller
    is expected to have normalized them already.
    """
    # NOTE(review): review comments suggested driving the two state
    # machines below from precomputed mappings instead of if/elif chains.
    prev_gcb = GCB_Other
    ep_state = _EP_INIT
    incb_state = _INCB_INIT
    ri_flag = False
    cluster_start = start

    for pos in range(start, end):
        cp = ord(string[pos])
        curr_gcb = _get_gcb(cp)

        # Extended Pictographic state machine (GB11):
        # ExtPict Extend* ZWJ x ExtPict
        emoji_run_open = ep_state == _EP_STARTED or ep_state == _EP_MATCHED
        if _get_ext_pict(cp):
            if ep_state == _EP_ZWJ:
                ep_state = _EP_MATCHED
            else:
                ep_state = _EP_STARTED
        elif emoji_run_open and curr_gcb == GCB_Extend:
            ep_state = _EP_STARTED
        elif emoji_run_open and curr_gcb == GCB_ZWJ:
            ep_state = _EP_ZWJ
        else:
            ep_state = _EP_INIT

        # Regional Indicator flag (GB12/GB13): toggles on every
        # consecutive Regional Indicator, resets on anything else.
        if curr_gcb == GCB_Regional_Indicator:
            ri_flag = not ri_flag
        else:
            ri_flag = False

        # Indic Conjunct Break state machine (GB9c).
        curr_incb = _get_incb(cp)
        if curr_incb == InCB_Consonant:
            if incb_state == _INCB_LINKER:
                incb_state = _INCB_MATCHED
            else:
                incb_state = _INCB_STARTED
        elif incb_state == _INCB_INIT:
            # Nothing started yet; stay in the initial state.
            pass
        elif curr_incb == InCB_Extend:
            # An Extend keeps a linker that was already seen.
            if incb_state != _INCB_LINKER:
                incb_state = _INCB_STARTED
        elif curr_incb == InCB_Linker:
            incb_state = _INCB_LINKER
        else:
            incb_state = _INCB_INIT

        # The first character of a cluster never has a boundary before it.
        if pos != cluster_start:
            if _grapheme_break(
                prev_gcb, curr_gcb, ep_state, ri_flag, incb_state
            ):
                yield Segment(string, cluster_start, pos)
                cluster_start = pos

        prev_gcb = curr_gcb

    # Emit the final (possibly only) cluster.
    if cluster_start < end:
        yield Segment(string, cluster_start, end)
Uh oh!
There was an error while loading. Please reload this page.