# Mirror of https://github.com/markqvist/NomadNet.git
# Synced 2026-05-20 13:05:30 +00:00
# 144 lines, 4.9 KiB, Python (as reported at sync time)
import re
|
||
import unicodedata
|
||
|
||
# Literal character sequences that strip_modifiers() replaces with a single
# space before any other filtering. Presumably these are glyphs that render
# badly in the UI (going by the name) -- TODO confirm.
invalid_rendering = ["🕵️", "☝"]
|
||
|
||
# Unicode blocks to strip (symbols, dingbats, emoji ranges).
#
# The pattern is one character class assembled from adjacent string literals,
# one range per line, so every block keeps its own label. The trailing '+'
# makes a run of consecutive matches disappear in a single substitution.
# No flags are passed: str patterns are Unicode-aware by default in Python 3,
# so re.UNICODE would be a no-op.
STRIP_BLOCKS_RE = re.compile(
    '['
    '\U0001F600-\U0001F64F'  # Emoticons
    '\U0001F300-\U0001F5FF'  # Misc Symbols & Pictographs
    '\U0001F680-\U0001F6FF'  # Transport & Map Symbols
    '\U0001F700-\U0001F77F'  # Alchemical Symbols
    '\U0001F780-\U0001F7FF'  # Geometric Shapes Extended
    '\U0001F800-\U0001F8FF'  # Supplemental Arrows-C
    '\U0001F900-\U0001F9FF'  # Supplemental Symbols & Pictographs
    '\U0001FA00-\U0001FA6F'  # Chess Symbols
    '\U0001FA70-\U0001FAFF'  # Symbols & Pictographs Extended-A
    '\U0001F1E0-\U0001F1FF'  # Regional indicator symbols (flag pairs)
    '\u2600-\u26FF'          # Misc Symbols (sun, cloud, umbrella, etc.)
    '\u2700-\u27BF'          # Dingbats (scissors, airplane, envelope, etc.)
    '\uFE00-\uFE0F'          # Variation Selectors
    '\U000E0100-\U000E01EF'  # Variation Selectors Supplement
    # Skin-tone modifiers already fall inside Misc Symbols & Pictographs
    # above; the range is repeated so the intent stays explicit.
    '\U0001F3FB-\U0001F3FF'  # Emoji modifiers (skin tones)
    ']+'
)
|
||
|
||
# Control characters and zero-width characters to strip.
#
# TAB (\x09), LF (\x0A) and CR (\x0D) fall between the listed C0 ranges and
# are therefore NOT matched by this pattern. As with the other STRIP_* regexes
# the class is followed by '+', and no flags are needed because str patterns
# are Unicode-aware by default in Python 3.
STRIP_CONTROL_RE = re.compile(
    '['
    '\x00-\x08'      # C0 controls (NUL-BS)
    '\x0B\x0C'       # VT, FF
    '\x0E-\x1F'      # C0 controls (SO-US)
    '\x7F-\x9F'      # DEL and C1 controls
    '\u200B-\u200F'  # Zero-width chars, LRM, RLM, etc.
    '\u202A-\u202E'  # Bidi embedding/override controls
    '\u2060-\u206F'  # Format chars (word joiner, bidi isolates, etc.)
    '\uFEFF'         # BOM / Zero Width NBSP
    '\uFFF0-\uFFF8'  # Specials
    ']+'
)
|
||
|
||
# Surrogates, private use areas and a few compatibility/presentation blocks.
#
# Surrogates and private-use code points shouldn't appear in valid UTF-8
# text, but are stripped just in case. The class additionally removes CJK
# Compatibility Ideographs, Vertical Forms and Combining Half Marks. No flags
# are needed because str patterns are Unicode-aware by default in Python 3.
STRIP_PRIVATE_RE = re.compile(
    '['
    '\uD800-\uDFFF'          # Surrogates
    '\uE000-\uF8FF'          # Private Use Area
    '\uF900-\uFAFF'          # CJK Compatibility Ideographs (stripped for safety)
    '\uFE10-\uFE1F'          # Vertical Forms
    '\uFE20-\uFE2F'          # Combining Half Marks
    '\U000F0000-\U000FFFFF'  # Supplementary Private Use Area-A
    '\U00100000-\U0010FFFF'  # Supplementary Private Use Area-B
    ']+'
)
|
||
|
||
# Code points removed explicitly after the category-based pass: variation
# selectors (both blocks), emoji skin-tone modifiers, and ZWNJ/ZWJ. The
# skin-tone modifiers are category Sk and therefore survive the category
# pass; the remaining ranges are listed as a safety net.
_MODIFIER_CODEPOINTS_RE = re.compile(
    '['
    '\uFE00-\uFE0F'          # Variation Selectors
    '\U000E0100-\U000E01EF'  # Variation Selectors Supplement
    '\U0001F3FB-\U0001F3FF'  # Emoji modifiers (skin tones)
    '\u200C\u200D'           # Zero-width non-joiner / joiner
    ']'
)

# CRLF or a lone CR, normalised to LF.
_LINE_ENDING_RE = re.compile(r'\r\n?')


def strip_modifiers(text):
    """Remove combining marks, format characters and emoji modifiers.

    Processing order:

    1. Every entry of the module-level ``invalid_rendering`` list is
       replaced by a single space.
    2. Characters whose Unicode general category is a mark (``M*``) or a
       format character (``Cf``) are dropped. Everything else is kept,
       including modifier symbols (``Sk``) such as ``^`` and the backtick.
    3. Variation selectors, skin-tone modifiers and ZWNJ/ZWJ are removed.
    4. ``\\r\\n`` and lone ``\\r`` become ``\\n``.
    5. NUL characters are removed and surrounding whitespace is trimmed.

    Args:
        text: The string to clean, or ``None``.

    Returns:
        The cleaned string, or ``None`` when ``text`` is ``None``.
    """
    if text is None:
        return None

    for sequence in invalid_rendering:
        text = text.replace(sequence, " ")

    # Only marks and format characters are filtered by category; the 'S'
    # prefix (which includes 'Sk') has always been kept by this function.
    kept = [
        char for char in text
        if not unicodedata.category(char).startswith(('M', 'Cf'))
    ]

    stripped = _MODIFIER_CODEPOINTS_RE.sub('', ''.join(kept))
    stripped = _LINE_ENDING_RE.sub('\n', stripped)

    # Drop NULs *before* trimming: str.strip() does not treat NUL as
    # whitespace, so trimming first would leave whitespace that was
    # shielded by a leading or trailing NUL.
    return stripped.replace("\x00", "").strip()
|
||
|
||
def sanitize_name(name):
    """Reduce a display name to letters, numbers, punctuation and spaces.

    The value is converted with ``str()`` and normalised to NFKC
    (compatibility decomposition followed by canonical composition), which
    folds forms such as circled digits or Roman-numeral characters into
    their plain equivalents while keeping composed letters intact.

    Characters are then filtered by Unicode general category:

    * ``L*`` (letters, including modifier letters ``Lm``), ``N*`` (numbers)
      and ``P*`` (punctuation) are kept.
    * ``Mc`` (spacing combining marks) are kept.
    * ``Zs``, ``Zl`` and ``Zp`` separators become a regular space.
    * Everything else is dropped: ``Mn``/``Me`` marks (e.g. Zalgo stacks),
      all ``C*`` categories and all ``S*`` symbols.

    Finally the module-level ``STRIP_BLOCKS_RE``, ``STRIP_CONTROL_RE`` and
    ``STRIP_PRIVATE_RE`` patterns remove anything the category filter let
    through, whitespace runs are collapsed, and the result is trimmed.

    Args:
        name: The name to sanitize (any object accepted by ``str()``), or
            ``None``.

    Returns:
        The sanitized string (possibly empty), or ``None`` when ``name`` is
        ``None``.
    """
    if name is None:
        return None

    name = unicodedata.normalize('NFKC', str(name))

    result = []
    for char in name:
        # unicodedata.category() always returns a two-letter code, so the
        # major class is simply its first letter.
        cat = unicodedata.category(char)

        if cat[0] in ('L', 'N', 'P') or cat == 'Mc':
            result.append(char)
        elif cat in ('Zs', 'Zl', 'Zp'):
            # Every kind of separator is normalised to a plain space.
            result.append(' ')
        # NOTE: control characters (Cc) such as TAB and LF are dropped here
        # rather than turned into spaces, so "a\nb" becomes "ab".

    name = ''.join(result)

    # Additional block-based stripping for code points whose category is
    # allowed above but whose block is not (e.g. dingbat digits).
    name = STRIP_BLOCKS_RE.sub('', name)
    name = STRIP_CONTROL_RE.sub('', name)
    name = STRIP_PRIVATE_RE.sub('', name)

    # Collapse whitespace runs, then trim both ends.
    return re.sub(r'\s+', ' ', name).strip()
|