diff --git a/docs/discord-bridge.md b/docs/discord-bridge.md index ed41f88..561cccf 100644 --- a/docs/discord-bridge.md +++ b/docs/discord-bridge.md @@ -86,7 +86,8 @@ Preview: `https://api.dicebear.com/7.x/{style}/png?seed=YourName` - **drop** (default): Do not bridge messages that contain profanity (message is dropped). - **censor**: Replace profanity with `****` and bridge the message. - **off**: No filtering; bridge all messages as-is. -- Requires the `better-profanity` package (see `requirements.txt`). If the package is not installed and `filter_profanity` is `drop` or `censor`, a warning is logged and messages are bridged without filtering. +- The filter checks word-based profanity (via `better-profanity` and optional `unidecode` for homoglyphs) and blocked hate symbols (e.g. swastika Unicode 卐/卍). In `censor` mode, hate symbols are replaced with `***`. +- Requires the `better-profanity` package (see `requirements.txt`). If the package is not installed and `filter_profanity` is `drop` or `censor`, a warning is logged and messages are bridged without word filtering; hate symbols are still filtered even without the package. --- diff --git a/docs/telegram-bridge.md b/docs/telegram-bridge.md index 18a997e..888079d 100644 --- a/docs/telegram-bridge.md +++ b/docs/telegram-bridge.md @@ -76,7 +76,7 @@ Send a message on the bridged MeshCore channel — it should appear in the Teleg | `parse_mode` | No | `HTML` (default), `Markdown`, or `MarkdownV2` | | `disable_web_page_preview` | No | `true`/`false` — disable link previews (default: false) | | `max_message_length` | No | 1–4096; truncate longer messages (default: 4096) | -| `filter_profanity` | No | Profanity handling: `drop` (default, do not bridge), `censor` (replace with ****), or `off`. Requires `better-profanity` package. | +| `filter_profanity` | No | Profanity handling: `drop` (default, do not bridge), `censor` (replace with `****`), or `off`. Word list via `better-profanity`; hate symbols (e.g. 卐/卍) are blocked/censored even when that package is not installed. 
| \* Either `api_token` in config or `TELEGRAM_BOT_TOKEN` in the environment must be set when the bridge is enabled. diff --git a/modules/profanity_filter.py b/modules/profanity_filter.py index 2a958a9..60441a0 100644 --- a/modules/profanity_filter.py +++ b/modules/profanity_filter.py @@ -5,10 +5,18 @@ Shared profanity filter for bridge services (Discord, Telegram). Uses better-profanity when available; gracefully falls back to no-op if not installed. Uses unidecode when available to normalize Unicode (e.g. homoglyphs) to ASCII so better-profanity can detect them. +Also checks for hate symbols (e.g. swastika Unicode) that word lists do not catch; this check does not depend on better-profanity and still runs when it is not installed. """ from typing import Optional +# Unicode code points for symbols we treat as profanity (e.g. swastika forms). +# These are checked in addition to better-profanity's word list. +_HATE_SYMBOL_CODEPOINTS = frozenset({ + 0x5350, # 卐 CJK swastika + 0x534D, # 卍 CJK swastika (reversed) +}) + _profanity_available = False _profanity_initialized = False _warned_unavailable = False @@ -27,6 +35,22 @@ except ImportError: unidecode = None # type: ignore +def _has_hate_symbols(text: str) -> bool: + """Return True if text contains any blocked hate-symbol code point.""" + for cp in _HATE_SYMBOL_CODEPOINTS: + if chr(cp) in text: + return True + return False + + +def _replace_hate_symbols(text: str, replacement: str = "***") -> str: + """Replace every occurrence of each blocked hate-symbol code point in text with replacement.""" + result = text + for cp in _HATE_SYMBOL_CODEPOINTS: + result = result.replace(chr(cp), replacement) + return result + + def _normalize_for_profanity(text: str) -> str: """Convert Unicode to ASCII when unidecode is available (catches homoglyph slurs).""" if _unidecode_available and unidecode is not None: @@ -55,6 +79,7 @@ def _ensure_initialized(logger: Optional[object] = None) -> bool: def censor(text: Optional[str], logger: Optional[object] = None) -> str: """ Replace profanity in text with ****. 
Returns original text if library unavailable. + Hate symbols (e.g. swastika Unicode) are always replaced with ***, even if the library is unavailable. Args: text: Input string (message or username). @@ -69,6 +94,8 @@ def censor(text: Optional[str], logger: Optional[object] = None) -> str: return str(text) if not text.strip(): return text + # Replace hate symbols first (no dependency on better-profanity) + text = _replace_hate_symbols(text) if not _ensure_initialized(logger): return text normalized = _normalize_for_profanity(text) @@ -77,17 +104,19 @@ def censor(text: Optional[str], logger: Optional[object] = None) -> str: def contains_profanity(text: Optional[str], logger: Optional[object] = None) -> bool: """ - Return True if text contains any word from the profanity wordlist. + Return True if text contains any word from the profanity wordlist or a blocked hate symbol. Args: text: Input string to check. logger: Optional logger for one-time warning when better_profanity is not installed. Returns: - True if profanity detected, False otherwise or if library unavailable. + True if profanity or a hate symbol is detected, False otherwise. If the library is unavailable, only hate symbols are detected. 
""" if text is None or not isinstance(text, str) or not text.strip(): return False + if _has_hate_symbols(text): + return True if not _ensure_initialized(logger): return False normalized = _normalize_for_profanity(text) diff --git a/tests/test_profanity_filter.py b/tests/test_profanity_filter.py index 48bab34..411ab9f 100644 --- a/tests/test_profanity_filter.py +++ b/tests/test_profanity_filter.py @@ -36,6 +36,22 @@ class TestProfanityFilterEdgeCases: def test_contains_profanity_non_string_returns_false(self): assert contains_profanity(123) is False + def test_hate_symbol_swastika_detected(self): + """CJK swastika Unicode in text is detected as profanity (no better_profanity needed).""" + assert contains_profanity("\u5350") is True # 卐 + assert contains_profanity("\u534d") is True # 卍 + assert contains_profanity("User\u5350name") is True + assert contains_profanity("Hello \u534d world") is True + + def test_hate_symbol_swastika_censored(self): + """CJK swastika Unicode is replaced with *** (no better_profanity needed).""" + assert censor("\u5350") == "***" + assert censor("\u534d") == "***" + assert censor("User\u5350name") == "User***name" + assert "***" in censor("Hello \u534d world") + assert "\u5350" not in censor("User\u5350name") + assert "\u534d" not in censor("Hello \u534d world") + class TestProfanityFilterWithLibrary: """Tests that require better_profanity to be installed (skip if not).""" @@ -125,3 +141,10 @@ class TestProfanityFilterFallbackWhenLibraryUnavailable: censor("hello", logger=logger) logger.warning.assert_called_once() assert "better-profanity" in logger.warning.call_args[0][0] + + def test_hate_symbol_still_detected_and_censored_when_library_unavailable(self): + """Hate symbols (e.g. swastika) are detected and replaced even when better_profanity is not installed.""" + import modules.profanity_filter as pf + with patch.object(pf, "_profanity_available", False): + assert contains_profanity("\u5350") is True + assert censor("\u5350") == "***"