import re import unicodedata invalid_rendering = ["🕵️", "☝"] # Unicode blocks to strip (symbols, dingbats, emoji ranges) STRIP_BLOCKS_RE = re.compile( '[' '\U0001F600-\U0001F64F' # Emoticons '\U0001F300-\U0001F5FF' # Misc Symbols & Pictographs '\U0001F680-\U0001F6FF' # Transport & Map Symbols '\U0001F700-\U0001F77F' # Alchemical Symbols '\U0001F780-\U0001F7FF' # Geometric Shapes Extended '\U0001F800-\U0001F8FF' # Supplemental Arrows-C '\U0001F900-\U0001F9FF' # Supplemental Symbols & Pictographs '\U0001FA00-\U0001FA6F' # Chess Symbols '\U0001FA70-\U0001FAFF' # Symbols & Pictographs Extended-A '\U0001F1E0-\U0001F1FF' # Flags (iOS/regional indicators) '\u2600-\u26FF' # Misc Symbols (☀, ☁, ☂, etc.) '\u2700-\u27BF' # Dingbats (✂, ✈, ✉, ✌, etc.) '\uFE00-\uFE0F' # Variation Selectors '\U000E0100-\U000E01EF' # Variation Selectors Supplement '\U0001F3FB-\U0001F3FF' # Emoji modifiers (skin tones) ']+', flags=re.UNICODE ) # Control characters and zero-width characters to strip STRIP_CONTROL_RE = re.compile( '[' '\x00-\x08' # C0 controls (NUL-BS) '\x0B\x0C' # VT, FF '\x0E-\x1F' # C0 controls (SO-US) '\x7F-\x9F' # DEL and C1 controls '\u200B-\u200F' # Zero-width chars, LRM, RLM, etc. '\u202A-\u202E' # Bidi embedding controls '\u2060-\u206F' # Format chars (word joiner, etc.) '\uFEFF' # BOM / Zero Width NBSP '\uFFF0-\uFFF8' # Specials ']+', flags=re.UNICODE ) # Surrogates and private use areas # Shouldn't appear in valid UTF-8, but strip just in case STRIP_PRIVATE_RE = re.compile( '[' '\uD800-\uDFFF' # Surrogates '\uE000-\uF8FF' # Private Use Area '\uF900-\uFAFF' # CJK Compatibility Ideographs (keep? strip for safety) '\uFE10-\uFE1F' # Vertical Forms '\uFE20-\uFE2F' # Combining Half Marks '\U000F0000-\U000FFFFF' # Supplementary Private Use Area-A '\U00100000-\U0010FFFF' # Supplementary Private Use Area-B ']+', flags=re.UNICODE ) def strip_modifiers(text): def process_characters(text): result = [] i = 0 while i < len(text): char = text[i] category = unicodedata.category(char) if category.startswith(('L', 'N', 'P', 'S')): result.append(char) i += 1 elif category.startswith(('M', 'Sk', 'Cf')) or char in '\u200d\u200c': i += 1 else: result.append(char) i += 1 return ''.join(result) if text == None: return None for char in invalid_rendering: text = text.replace(char, " ") stripped = process_characters(text) stripped = re.sub(r'[\uFE00-\uFE0F]', '', stripped) stripped = re.sub(r'[\U000E0100-\U000E01EF]', '', stripped, flags=re.UNICODE) stripped = re.sub(r'[\U0001F3FB-\U0001F3FF]', '', stripped, flags=re.UNICODE) stripped = re.sub(r'[\u200D\u200C]', '', stripped) stripped = re.sub(r'\r\n?', '\n', stripped) return stripped.strip().replace("\x00", "") def sanitize_name(name): if name is None: return None # Convert to string and normalize to NFKC # NFKC: Compatibility decomposition followed by canonical composition # This handles: ① to 1, Ⅰ to I, etc., while keeping composed forms name = str(name) name = unicodedata.normalize('NFKC', name) # Build result using category-based filtering result = [] for char in name: cat = unicodedata.category(char) cat_prefix = cat[0] if cat else 'C' # Allow letters (L*), numbers (N*), and punctuation (P*) if cat_prefix in ('L', 'N', 'P'): result.append(char) # Allow space separator, normalize to regular space elif cat == 'Zs': result.append(' ') # Convert line/paragraph separators to space elif cat in ('Zl', 'Zp'): result.append(' ') # Allow spacing combining marks (Mc) for Indic, Hebrew, etc. elif cat == 'Mc': result.append(char) # Allow modifier letters (Lm) - e.g., ʰ, ʱ, ː elif cat == 'Lm': result.append(char) # Strip everything else: # - Mn (Nonspacing Mark): diacritics, combining marks (Zalgo) # - Me (Enclosing Mark): enclosing combining marks # - C* (Controls, Format, Surrogates, Private Use, Unassigned) # - S* (Symbols: currency, math, modifiers, other) name = ''.join(result) # Additional block-based stripping for symbols that categories missed name = STRIP_BLOCKS_RE.sub('', name) name = STRIP_CONTROL_RE.sub('', name) name = STRIP_PRIVATE_RE.sub('', name) # Collapse multiple whitespace characters name = re.sub(r'\s+', ' ', name) # Strip leading/trailing whitespace name = name.strip() return name