From 4dbb7dd3da207b6adac12f4b9c98e609c900d20b Mon Sep 17 00:00:00 2001 From: Michael Date: Mon, 28 Aug 2023 15:37:20 +0000 Subject: [PATCH 1/8] Posts without text or only with emojis are now always accepted in the language check --- src/Content/Smilies.php | 13 +++++++++++++ src/Content/Text/BBCode.php | 8 ++------ src/Protocol/Relay.php | 6 ++++++ 3 files changed, 21 insertions(+), 6 deletions(-) diff --git a/src/Content/Smilies.php b/src/Content/Smilies.php index d231797b81..6d07de575c 100644 --- a/src/Content/Smilies.php +++ b/src/Content/Smilies.php @@ -285,4 +285,17 @@ class Smilies return str_replace($matches[0], $t, $matches[0]); } + + /** + * Checks if the body only contains 4 byte unicode characters. + * + * @param string $body + * @return boolean + */ + public static function isEmojiPost(string $body): bool + { + $conv = html_entity_decode(str_replace([' ', "\n", "\r"], '', $body)); + // Emojis are always 4 byte Unicode characters + return (!empty($conv) && (strlen($conv) / mb_strlen($conv) == 4)); + } } diff --git a/src/Content/Text/BBCode.php b/src/Content/Text/BBCode.php index ab7300da18..239e6dfa09 100644 --- a/src/Content/Text/BBCode.php +++ b/src/Content/Text/BBCode.php @@ -1735,12 +1735,8 @@ class BBCode $text = preg_replace("/\[event\-id\](.*?)\[\/event\-id\]/ism", '', $text); } - if (!$for_plaintext && DI::config()->get('system', 'big_emojis') && ($simple_html != self::DIASPORA)) { - $conv = html_entity_decode(str_replace([' ', "\n", "\r"], '', $text)); - // Emojis are always 4 byte Unicode characters - if (!empty($conv) && (strlen($conv) / mb_strlen($conv) == 4)) { - $text = '' . $text . ''; - } + if (!$for_plaintext && DI::config()->get('system', 'big_emojis') && ($simple_html != self::DIASPORA) && Smilies::isEmojiPost($text)) { + $text = '' . $text . 
''; } // Handle mentions and hashtag links diff --git a/src/Protocol/Relay.php b/src/Protocol/Relay.php index 2002aa9bb8..8bcced7567 100644 --- a/src/Protocol/Relay.php +++ b/src/Protocol/Relay.php @@ -21,6 +21,7 @@ namespace Friendica\Protocol; +use Friendica\Content\Smilies; use Friendica\Content\Text\BBCode; use Friendica\Core\Logger; use Friendica\Core\Protocol; @@ -157,6 +158,11 @@ class Relay */ public static function isWantedLanguage(string $body) { + if (empty($body) || Smilies::isEmojiPost($body)) { + Logger::debug('Empty body or only emojis', ['body' => $body]); + return true; + } + $languages = []; foreach (Item::getLanguageArray($body, 10) as $language => $reliability) { if ($reliability > 0) { From 9066a6133c5d4c0a5f6a1a8425eb4685130acd45 Mon Sep 17 00:00:00 2001 From: Michael Date: Mon, 28 Aug 2023 20:24:20 +0000 Subject: [PATCH 2/8] New function to replace blank characters --- src/Content/Smilies.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Content/Smilies.php b/src/Content/Smilies.php index 6d07de575c..add6281ec3 100644 --- a/src/Content/Smilies.php +++ b/src/Content/Smilies.php @@ -294,7 +294,7 @@ class Smilies */ public static function isEmojiPost(string $body): bool { - $conv = html_entity_decode(str_replace([' ', "\n", "\r"], '', $body)); + $conv = html_entity_decode(preg_replace('#\s#', '', $body)); // Emojis are always 4 byte Unicode characters return (!empty($conv) && (strlen($conv) / mb_strlen($conv) == 4)); } From 7ee07535f5b7bd8fdc68d8c5c7ddae529346e469 Mon Sep 17 00:00:00 2001 From: Michael Date: Mon, 28 Aug 2023 20:53:31 +0000 Subject: [PATCH 3/8] Move "html_entity_decode" --- src/Content/Smilies.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Content/Smilies.php b/src/Content/Smilies.php index add6281ec3..0c3397f13e 100644 --- a/src/Content/Smilies.php +++ b/src/Content/Smilies.php @@ -294,7 +294,7 @@ class Smilies */ public static function isEmojiPost(string $body): bool { - $conv 
= html_entity_decode(preg_replace('#\s#', '', $body)); + $conv = preg_replace('#\s#', '', html_entity_decode($body)); // Emojis are always 4 byte Unicode characters return (!empty($conv) && (strlen($conv) / mb_strlen($conv) == 4)); } From 6ed440718d28da5449e97ae34a8a4bfcae91146f Mon Sep 17 00:00:00 2001 From: Michael Vogel Date: Tue, 29 Aug 2023 04:59:27 +0200 Subject: [PATCH 4/8] Update src/Content/Smilies.php Co-authored-by: Hypolite Petovan --- src/Content/Smilies.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Content/Smilies.php b/src/Content/Smilies.php index 0c3397f13e..b7abbd8e5f 100644 --- a/src/Content/Smilies.php +++ b/src/Content/Smilies.php @@ -294,7 +294,7 @@ class Smilies */ public static function isEmojiPost(string $body): bool { - $conv = preg_replace('#\s#', '', html_entity_decode($body)); + $conv = preg_replace('#\s#u', '', html_entity_decode($body)); // Emojis are always 4 byte Unicode characters return (!empty($conv) && (strlen($conv) / mb_strlen($conv) == 4)); } From 059a1112824e19cb9267c170c890a583c018f874 Mon Sep 17 00:00:00 2001 From: Hypolite Petovan Date: Tue, 29 Aug 2023 22:16:09 -0400 Subject: [PATCH 5/8] Add unit tests for Smilies::isEmojiPost - Current implementation is failing tests with emojis including the zero-width-joiner character, encoded on 3 bytes only. --- src/Content/Smilies.php | 3 +- tests/src/Content/SmiliesTest.php | 71 +++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+), 1 deletion(-) diff --git a/src/Content/Smilies.php b/src/Content/Smilies.php index b7abbd8e5f..664a4089ce 100644 --- a/src/Content/Smilies.php +++ b/src/Content/Smilies.php @@ -289,11 +289,12 @@ class Smilies /** * Checks if the body only contains 4 byte unicode characters. 
* - * @param string $body + * @param string $body Possibly-HTML post body * @return boolean */ public static function isEmojiPost(string $body): bool { + // Strips all whitespace $conv = preg_replace('#\s#u', '', html_entity_decode($body)); // Emojis are always 4 byte Unicode characters return (!empty($conv) && (strlen($conv) / mb_strlen($conv) == 4)); diff --git a/tests/src/Content/SmiliesTest.php b/tests/src/Content/SmiliesTest.php index a886f1ac01..38eb743e85 100644 --- a/tests/src/Content/SmiliesTest.php +++ b/tests/src/Content/SmiliesTest.php @@ -72,4 +72,75 @@ class SmiliesTest extends FixtureTest $output = Smilies::replaceFromArray($text, $smilies); self::assertEquals($expected, $output); } + + public function dataIsEmojiPost(): array + { + return [ + 'emoji' => [ + 'expected' => true, + 'body' => '👀', + ], + 'emojis' => [ + 'expected' => true, + 'body' => '👀🤷', + ], + 'emoji+whitespace' => [ + 'expected' => true, + 'body' => ' 👀 ', + ], + 'empty' => [ + 'expected' => false, + 'body' => '', + ], + 'whitespace' => [ + 'expected' => false, + 'body' => ' + ', + ], + 'emoji+ASCII' => [ + 'expected' => false, + 'body' => '🤷a', + ], + 'HTML entity whitespace' => [ + 'expected' => false, + 'body' => ' ', + ], + 'HTML entity else' => [ + 'expected' => false, + 'body' => '°', + ], + 'emojis+HTML whitespace' => [ + 'expected' => true, + 'body' => '👀 🤷', + ], + 'emojis+HTML else' => [ + 'expected' => false, + 'body' => '👀<🤷', + ], + 'zwj' => [ + 'expected' => true, + 'body' => '👨‍👨‍👧‍', + ], + 'zwj+whitespace' => [ + 'expected' => true, + 'body' => ' 👨‍👨‍👧‍ ', + ], + 'zwj+HTML whitespace' => [ + 'expected' => true, + 'body' => ' 👨‍👨‍👧‍ ', + ], + ]; + } + + /** + * @dataProvider dataIsEmojiPost + * + * @param bool $expected + * @param string $body + * @return void + */ + public function testIsEmojiPost(bool $expected, string $body) + { + $this->assertEquals($expected, Smilies::isEmojiPost($body)); + 
} } From 64b5f93a6a161e58a7677f92ef92c421ac750ad8 Mon Sep 17 00:00:00 2001 From: Hypolite Petovan Date: Tue, 29 Aug 2023 22:17:48 -0400 Subject: [PATCH 6/8] Add FIXME - Current implementation is failing tests with emojis including the zero-width-joiner character, encoded on 3 bytes only. --- src/Content/Smilies.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Content/Smilies.php b/src/Content/Smilies.php index 664a4089ce..8744e19995 100644 --- a/src/Content/Smilies.php +++ b/src/Content/Smilies.php @@ -296,7 +296,7 @@ class Smilies { // Strips all whitespace $conv = preg_replace('#\s#u', '', html_entity_decode($body)); - // Emojis are always 4 byte Unicode characters + // @FIXME Emojis are almost always 4 byte Unicode characters, except when they include the zero-width joiner character, encoded on 3 bytes return (!empty($conv) && (strlen($conv) / mb_strlen($conv) == 4)); } } From d1eb1ec0f4af2fab5db039855c4471c83444c9aa Mon Sep 17 00:00:00 2001 From: Michael Date: Thu, 12 Oct 2023 21:23:08 +0000 Subject: [PATCH 7/8] Use "IntlChar" for the emoji detection --- src/Content/Smilies.php | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/src/Content/Smilies.php b/src/Content/Smilies.php index 8744e19995..7d3e4073f4 100644 --- a/src/Content/Smilies.php +++ b/src/Content/Smilies.php @@ -296,7 +296,22 @@ class Smilies { // Strips all whitespace $conv = preg_replace('#\s#u', '', html_entity_decode($body)); - // @FIXME Emojis are almost always 4 byte Unicode characters, except when they include the zero-width joiner character, encoded on 3 bytes - return (!empty($conv) && (strlen($conv) / mb_strlen($conv) == 4)); + if (empty($conv)) { + return false; + } + + if (!class_exists('IntlChar')) { + // Most Emojis are 4 byte Unicode characters, so this is a good workaround, when IntlChar does not exist on the system + return strlen($conv) / mb_strlen($conv) == 4; + } + + for ($i = 0; $i < mb_strlen($conv); $i++) { + 
$character = mb_substr($conv, $i, 1); + + if (\IntlChar::isalnum($character)) { + return false; + } + } + return true; } } From e558a8378391999d3e3e9fc877e9a841662e3a59 Mon Sep 17 00:00:00 2001 From: Michael Date: Fri, 13 Oct 2023 21:55:15 +0000 Subject: [PATCH 8/8] Improved Emoji detection --- src/Content/Smilies.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Content/Smilies.php b/src/Content/Smilies.php index 7d3e4073f4..2e1a6cf19c 100644 --- a/src/Content/Smilies.php +++ b/src/Content/Smilies.php @@ -287,7 +287,7 @@ class Smilies } /** - * Checks if the body only contains 4 byte unicode characters. + * Checks if the body doesn't contain any alphanumeric characters * * @param string $body Possibly-HTML post body * @return boolean @@ -308,7 +308,7 @@ class Smilies for ($i = 0; $i < mb_strlen($conv); $i++) { $character = mb_substr($conv, $i, 1); - if (\IntlChar::isalnum($character)) { + if (\IntlChar::isalnum($character) || \IntlChar::ispunct($character) || \IntlChar::isgraph($character) && (strlen($character) <= 2)) { return false; } }