Files
NomadNet/nomadnet/util.py
T
2026-05-06 18:44:17 +02:00

144 lines
4.9 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import re
import unicodedata
# Sequences that strip_modifiers() replaces with a single space before any
# other filtering runs (presumably ones that render badly in the UI).
# NOTE(review): the second entry reads as an empty string here. It may really
# hold an invisible character that did not survive copying -- confirm against
# the raw file. A truly empty entry would make str.replace() insert a space
# between every character of the text.
invalid_rendering = ["🕵️", ""]
# Matches one or more consecutive characters from the symbol, dingbat and
# emoji ranges listed below. The ranges are joined into a single character
# class, so the compiled pattern is "[<ranges>]+".
STRIP_BLOCKS_RE = re.compile(
    '[' + ''.join((
        '\U0001F600-\U0001F64F',  # Emoticons
        '\U0001F300-\U0001F5FF',  # Miscellaneous Symbols and Pictographs
        '\U0001F680-\U0001F6FF',  # Transport and Map Symbols
        '\U0001F700-\U0001F77F',  # Alchemical Symbols
        '\U0001F780-\U0001F7FF',  # Geometric Shapes Extended
        '\U0001F800-\U0001F8FF',  # Supplemental Arrows-C
        '\U0001F900-\U0001F9FF',  # Supplemental Symbols and Pictographs
        '\U0001FA00-\U0001FA6F',  # Chess Symbols
        '\U0001FA70-\U0001FAFF',  # Symbols and Pictographs Extended-A
        '\U0001F1E0-\U0001F1FF',  # Regional indicators (flag letters)
        '\u2600-\u26FF',          # Miscellaneous Symbols
        '\u2700-\u27BF',          # Dingbats
        '\uFE00-\uFE0F',          # Variation Selectors
        '\U000E0100-\U000E01EF',  # Variation Selectors Supplement
        '\U0001F3FB-\U0001F3FF',  # Emoji skin-tone modifiers
    )) + ']+',
    re.UNICODE,
)
# Matches one or more consecutive control, zero-width or directional
# formatting characters. Tab (\x09), line feed (\x0A) and carriage return
# (\x0D) are deliberately absent from the class and are therefore not matched.
STRIP_CONTROL_RE = re.compile(
    '[' + ''.join((
        '\x00-\x08',      # C0 controls, NUL through BS
        '\x0B\x0C',       # Vertical tab and form feed
        '\x0E-\x1F',      # C0 controls, SO through US
        '\x7F-\x9F',      # DEL and the C1 controls
        '\u200B-\u200F',  # Zero-width space/joiners, LRM, RLM
        '\u202A-\u202E',  # Bidi embedding and override controls
        '\u2060-\u206F',  # Word joiner and other format characters
        '\uFEFF',         # Byte order mark / zero width no-break space
        '\uFFF0-\uFFF8',  # First code points of the Specials block
    )) + ']+',
    re.UNICODE,
)
# Matches one or more consecutive surrogate or private-use code points, plus
# a few compatibility blocks that are removed as a precaution. Lone
# surrogates should not occur in well-formed text but are covered anyway.
STRIP_PRIVATE_RE = re.compile(
    '[' + ''.join((
        '\uD800-\uDFFF',          # Surrogates
        '\uE000-\uF8FF',          # Private Use Area
        '\uF900-\uFAFF',          # CJK Compatibility Ideographs (stripped for safety)
        '\uFE10-\uFE1F',          # Vertical Forms
        '\uFE20-\uFE2F',          # Combining Half Marks
        '\U000F0000-\U000FFFFF',  # Supplementary Private Use Area-A
        '\U00100000-\U0010FFFF',  # Supplementary Private Use Area-B
    )) + ']+',
    re.UNICODE,
)
def _is_stripped_modifier(char):
    """Return True if char is a combining mark or a format character."""
    category = unicodedata.category(char)
    return (
        category.startswith('M')
        or category == 'Cf'
        or char in '\u200d\u200c'
    )


def strip_modifiers(text):
    """Remove combining marks, format characters and emoji modifiers from text.

    Processing steps, in order:

    1. Every non-empty entry of ``invalid_rendering`` is replaced by a space.
    2. Characters whose Unicode category is a mark (``M*``) or a format
       character (``Cf``), and the zero-width joiner / non-joiner, are
       dropped. All other characters are kept, including every symbol
       category (``S*``).
    3. Variation selectors, the variation selector supplement, emoji
       skin-tone modifiers and ZWJ/ZWNJ are removed by explicit range.
    4. ``\\r\\n`` and lone ``\\r`` are converted to ``\\n``.
    5. NUL characters are removed and surrounding whitespace is trimmed.

    Args:
        text: The string to clean, or None.

    Returns:
        The cleaned string, or None if ``text`` is None.
    """
    if text is None:
        return None

    for sequence in invalid_rendering:
        # str.replace("", " ") inserts a space between every character, so an
        # empty entry must never be used as a search string.
        if sequence:
            text = text.replace(sequence, " ")

    stripped = ''.join(
        char for char in text if not _is_stripped_modifier(char)
    )

    # Explicit ranges on top of the category filter: variation selectors,
    # variation selector supplement, skin-tone modifiers, ZWJ and ZWNJ.
    stripped = re.sub(
        r'[\uFE00-\uFE0F\U000E0100-\U000E01EF\U0001F3FB-\U0001F3FF\u200C\u200D]',
        '',
        stripped,
    )

    # Normalize CRLF and bare CR line endings to LF.
    stripped = re.sub(r'\r\n?', '\n', stripped)

    # Drop NULs before trimming so that whitespace sitting next to a leading
    # or trailing NUL is trimmed as well.
    return stripped.replace("\x00", "").strip()
def sanitize_name(name):
    """Return a cleaned, single-line version of a display name.

    The value is converted to ``str`` and NFKC-normalized, then filtered
    character by character on its Unicode general category:

    * letters (``L*``), numbers (``N*``), punctuation (``P*``) and spacing
      combining marks (``Mc``) are kept;
    * separators (``Zs``, ``Zl``, ``Zp``) and tab / line feed / carriage
      return become a regular space;
    * everything else is dropped: nonspacing and enclosing marks
      (``Mn``, ``Me``), the remaining ``C*`` categories (controls, format,
      surrogates, private use, unassigned) and all symbols (``S*``).

    The result is then passed through ``STRIP_BLOCKS_RE``,
    ``STRIP_CONTROL_RE`` and ``STRIP_PRIVATE_RE``, runs of whitespace are
    collapsed to one space, and leading/trailing whitespace is removed.

    Args:
        name: The raw name; any object accepted by ``str()``, or None.

    Returns:
        The sanitized string (possibly empty), or None if ``name`` is None.
    """
    if name is None:
        return None

    # NFKC applies compatibility decomposition followed by canonical
    # composition, e.g. "\u2460" (circled one) -> "1" and "\u2160" (Roman
    # numeral one) -> "I", while accented letters stay composed.
    name = unicodedata.normalize('NFKC', str(name))

    kept = []
    for char in name:
        cat = unicodedata.category(char)
        if cat[0] in 'LNP' or cat == 'Mc':
            kept.append(char)
        elif cat[0] == 'Z' or char in '\t\n\r':
            # Tab, LF and CR are category Cc and would otherwise be dropped,
            # fusing the words on either side. Treat them like the Unicode
            # separators so the whitespace collapse below handles them.
            kept.append(' ')
        # Anything else (Mn, Me, C*, S*) is intentionally discarded.
    name = ''.join(kept)

    # Block-based stripping for characters the category filter lets through
    # (for example numbers and punctuation inside the Dingbats block).
    name = STRIP_BLOCKS_RE.sub('', name)
    name = STRIP_CONTROL_RE.sub('', name)
    name = STRIP_PRIVATE_RE.sub('', name)

    # Collapse whitespace runs and trim the ends.
    return re.sub(r'\s+', ' ', name).strip()