mirror of
https://github.com/agessaman/meshcore-bot.git
synced 2026-03-30 12:05:38 +00:00
Enhance profanity filtering to include hate symbol detection and censorship
- Updated the profanity filter to check for hate symbols (e.g., swastika Unicode) in addition to word-based profanity, ensuring comprehensive message filtering. - Modified the `censor` function to replace hate symbols with `***`, maintaining functionality even when the `better-profanity` package is unavailable. - Updated documentation in `discord-bridge.md` and `telegram-bridge.md` to reflect the new hate symbol handling features. - Added tests to verify detection and censorship of hate symbols, ensuring robustness of the profanity filter.
This commit is contained in:
@@ -86,7 +86,8 @@ Preview: `https://api.dicebear.com/7.x/{style}/png?seed=YourName`
|
||||
- **drop** (default): Do not bridge messages that contain profanity (message is dropped).
|
||||
- **censor**: Replace profanity with `****` and bridge the message.
|
||||
- **off**: No filtering; bridge all messages as-is.
|
||||
- Requires the `better-profanity` package (see `requirements.txt`). If the package is not installed and `filter_profanity` is `drop` or `censor`, a warning is logged and messages are bridged without filtering.
|
||||
- The filter checks word-based profanity (via `better-profanity` and optional `unidecode` for homoglyphs) and blocked hate symbols (e.g. swastika Unicode 卐/卍). In `censor` mode, symbols are replaced with `***`; in `drop` mode, a message containing one is not bridged.
|
||||
- Requires the `better-profanity` package (see `requirements.txt`). If the package is not installed and `filter_profanity` is `drop` or `censor`, a warning is logged and messages are bridged without word filtering; hate symbols are still filtered even without the package.
|
||||
|
||||
---
|
||||
|
||||
|
||||
@@ -76,7 +76,7 @@ Send a message on the bridged MeshCore channel — it should appear in the Teleg
|
||||
| `parse_mode` | No | `HTML` (default), `Markdown`, or `MarkdownV2` |
|
||||
| `disable_web_page_preview` | No | `true`/`false` — disable link previews (default: false) |
|
||||
| `max_message_length` | No | 1–4096; truncate longer messages (default: 4096) |
|
||||
| `filter_profanity` | No | Profanity handling: `drop` (default, do not bridge), `censor` (replace with ****), or `off`. Requires `better-profanity` package. |
|
||||
| `filter_profanity` | No | Profanity handling: `drop` (default), `censor`, or `off`. Word list via `better-profanity`; hate symbols (e.g. 卐/卍) are always blocked/censored. |
|
||||
|
||||
\* Either `api_token` in config or `TELEGRAM_BOT_TOKEN` in the environment must be set when the bridge is enabled.
|
||||
|
||||
|
||||
@@ -5,10 +5,18 @@ Shared profanity filter for bridge services (Discord, Telegram).
|
||||
Uses better-profanity when available; gracefully falls back to no-op if not installed.
|
||||
Uses unidecode when available to normalize Unicode (e.g. homoglyphs) to ASCII so
|
||||
better-profanity can detect them.
|
||||
Also checks for hate symbols (e.g. swastika Unicode) that word lists do not catch.
|
||||
"""
|
||||
|
||||
from typing import Optional
|
||||
|
||||
# Unicode code points for symbols we treat as profanity (e.g. swastika forms).
|
||||
# These are checked in addition to better-profanity's word list.
|
||||
_HATE_SYMBOL_CODEPOINTS = frozenset(
    (
        0x534D,  # 卍 CJK swastika (reversed)
        0x5350,  # 卐 CJK swastika
    )
)
|
||||
|
||||
_profanity_available = False
|
||||
_profanity_initialized = False
|
||||
_warned_unavailable = False
|
||||
@@ -27,6 +35,22 @@ except ImportError:
|
||||
unidecode = None # type: ignore
|
||||
|
||||
|
||||
def _has_hate_symbols(text: str) -> bool:
    """Report whether ``text`` includes at least one blocked hate symbol.

    Each code point in ``_HATE_SYMBOL_CODEPOINTS`` is turned into its
    character and looked up in ``text`` with a substring test.

    Args:
        text: String to scan.

    Returns:
        True as soon as one blocked character is found, otherwise False.
    """
    return any(chr(codepoint) in text for codepoint in _HATE_SYMBOL_CODEPOINTS)
|
||||
|
||||
|
||||
def _replace_hate_symbols(text: str, replacement: str = "***") -> str:
    """Substitute ``replacement`` for every blocked hate symbol in ``text``.

    Args:
        text: String to clean.
        replacement: Text inserted in place of each blocked character.

    Returns:
        A copy of ``text`` with each character listed in
        ``_HATE_SYMBOL_CODEPOINTS`` swapped for ``replacement``.
    """
    censored = text
    # One str.replace pass per blocked character, applied cumulatively.
    for symbol in map(chr, _HATE_SYMBOL_CODEPOINTS):
        censored = censored.replace(symbol, replacement)
    return censored
|
||||
|
||||
|
||||
def _normalize_for_profanity(text: str) -> str:
|
||||
"""Convert Unicode to ASCII when unidecode is available (catches homoglyph slurs)."""
|
||||
if _unidecode_available and unidecode is not None:
|
||||
@@ -55,6 +79,7 @@ def _ensure_initialized(logger: Optional[object] = None) -> bool:
|
||||
def censor(text: Optional[str], logger: Optional[object] = None) -> str:
|
||||
"""
|
||||
Replace profanity in text with ****. Word-based profanity is left unchanged if the library is unavailable.
|
||||
Hate symbols (e.g. swastika Unicode) are replaced with ***.
|
||||
|
||||
Args:
|
||||
text: Input string (message or username).
|
||||
@@ -69,6 +94,8 @@ def censor(text: Optional[str], logger: Optional[object] = None) -> str:
|
||||
return str(text)
|
||||
if not text.strip():
|
||||
return text
|
||||
# Replace hate symbols first (no dependency on better-profanity)
|
||||
text = _replace_hate_symbols(text)
|
||||
if not _ensure_initialized(logger):
|
||||
return text
|
||||
normalized = _normalize_for_profanity(text)
|
||||
@@ -77,17 +104,19 @@ def censor(text: Optional[str], logger: Optional[object] = None) -> str:
|
||||
|
||||
def contains_profanity(text: Optional[str], logger: Optional[object] = None) -> bool:
|
||||
"""
|
||||
Return True if text contains any word from the profanity wordlist.
|
||||
Return True if text contains any word from the profanity wordlist or a blocked hate symbol.
|
||||
|
||||
Args:
|
||||
text: Input string to check.
|
||||
logger: Optional logger for one-time warning when better_profanity is not installed.
|
||||
|
||||
Returns:
|
||||
True if profanity detected, False otherwise or if library unavailable.
|
||||
True if profanity or a hate symbol is detected, False otherwise. If the library is unavailable, only hate symbols are detected.
|
||||
"""
|
||||
if text is None or not isinstance(text, str) or not text.strip():
|
||||
return False
|
||||
if _has_hate_symbols(text):
|
||||
return True
|
||||
if not _ensure_initialized(logger):
|
||||
return False
|
||||
normalized = _normalize_for_profanity(text)
|
||||
|
||||
@@ -36,6 +36,22 @@ class TestProfanityFilterEdgeCases:
|
||||
def test_contains_profanity_non_string_returns_false(self):
|
||||
assert contains_profanity(123) is False
|
||||
|
||||
def test_hate_symbol_swastika_detected(self):
    """Both CJK swastika forms count as profanity, alone or embedded in other text."""
    samples = (
        "\u5350",  # 卐
        "\u534d",  # 卍
        "User\u5350name",
        "Hello \u534d world",
    )
    for sample in samples:
        assert contains_profanity(sample) is True
|
||||
|
||||
def test_hate_symbol_swastika_censored(self):
    """Both CJK swastika forms come back from censor() as ***."""
    exact_cases = (
        ("\u5350", "***"),
        ("\u534d", "***"),
        ("User\u5350name", "User***name"),
    )
    for raw, expected in exact_cases:
        assert censor(raw) == expected

    # Looser checks: the marker is present and the symbol itself is gone.
    assert "***" in censor("Hello \u534d world")
    assert "\u5350" not in censor("User\u5350name")
    assert "\u534d" not in censor("Hello \u534d world")
|
||||
|
||||
|
||||
class TestProfanityFilterWithLibrary:
|
||||
"""Tests that require better_profanity to be installed (skip if not)."""
|
||||
@@ -125,3 +141,10 @@ class TestProfanityFilterFallbackWhenLibraryUnavailable:
|
||||
censor("hello", logger=logger)
|
||||
logger.warning.assert_called_once()
|
||||
assert "better-profanity" in logger.warning.call_args[0][0]
|
||||
|
||||
def test_hate_symbol_still_detected_and_censored_when_library_unavailable(self):
    """With better_profanity flagged as unavailable, a swastika is still detected and replaced."""
    import modules.profanity_filter as pf

    swastika = "\u5350"
    # Force the module's availability flag off for the duration of the checks.
    with patch.object(pf, "_profanity_available", False):
        detected = contains_profanity(swastika)
        assert detected is True
        censored = censor(swastika)
        assert censored == "***"
|
||||
|
||||
Reference in New Issue
Block a user