From f4591b2cc72864cea149bcc7d1f93219a3ea39bf Mon Sep 17 00:00:00 2001 From: Michael Date: Wed, 11 Oct 2023 18:38:14 +0000 Subject: [PATCH] The language detection is now done in blocks --- doc/Addons.md | 1 + doc/de/Addons.md | 1 + src/Model/Item.php | 114 +++++++++++++++++++++++++++++++++------------ 3 files changed, 85 insertions(+), 31 deletions(-) diff --git a/doc/Addons.md b/doc/Addons.md index bfccde5ddf..b89a48d26d 100644 --- a/doc/Addons.md +++ b/doc/Addons.md @@ -228,6 +228,7 @@ Called after the language detection. This can be used for alternative language d - **text**: The text that is analyzed. - **detected**: (input/output) Array of language codes detected in the related text. The array key is the language code, the array value the probability. - **uri-id**: The Uri-Id of the item. +- **author-id**: The id of the author contact. ### addon_settings Called when generating the HTML for the addon settings page. diff --git a/doc/de/Addons.md b/doc/de/Addons.md index c61b68b489..0843c103ab 100644 --- a/doc/de/Addons.md +++ b/doc/de/Addons.md @@ -110,6 +110,7 @@ Dieser Hook kann dafür verwendet werden, alternative Erkennungsfunktionen einzu 'text' => Der analysierte Text. 'detected' => (Eingabe/Ausgabe) Das Array mit den erkannten Sprachen. Der Sprachcode ist der Array-Schlüssel, der Array-Wert ist der dezimale Wert für die Wahrscheinlichkeit. 'uri-id' => Die Uri-Id des Beitrags + 'author-id' => Die Contact-id des Autors. **'addon_settings'** - wird aufgerufen, wenn die HTML-Ausgabe der Addon-Einstellungsseite generiert wird. $b ist die HTML-Ausgabe (String) der Addon-Einstellungsseite vor dem finalen ""-Tag. diff --git a/src/Model/Item.php b/src/Model/Item.php index d3ae8aa6c6..55884a802e 100644 --- a/src/Model/Item.php +++ b/src/Model/Item.php @@ -49,6 +49,7 @@ use Friendica\Util\Proxy; use Friendica\Util\Strings; use Friendica\Util\Temporal; use GuzzleHttp\Psr7\Uri; +use IntlChar; use LanguageDetection\Language; class Item @@ -2010,67 +2011,118 @@ class Item */ public static function getLanguageArray(string $body, int $count, int $uri_id = 0, int $author_id = 0): array { - $naked_body = BBCode::toSearchText($body, $uri_id); + $searchtext = BBCode::toSearchText($body, $uri_id); - if ((count(explode(' ', $naked_body)) < 10) && (mb_strlen($naked_body) < 30) && $author_id) { + if ((count(explode(' ', $searchtext)) < 10) && (mb_strlen($searchtext) < 30) && $author_id) { $author = Contact::selectFirst(['about'], ['id' => $author_id]); if (!empty($author['about'])) { $about = BBCode::toSearchText($author['about'], 0); - $about = self::getDominantLanguage($about); - Logger::debug('About field added', ['author' => $author_id, 'body' => $naked_body, 'about' => $about]); - $naked_body .= ' ' . $about; + Logger::debug('About field added', ['author' => $author_id, 'body' => $searchtext, 'about' => $about]); + $searchtext .= ' ' . $about; } } - if (empty($naked_body)) { + if (empty($searchtext)) { return []; } - $naked_body = self::getDominantLanguage($naked_body); - $availableLanguages = DI::l10n()->getAvailableLanguages(true); $availableLanguages = DI::l10n()->convertForLanguageDetection($availableLanguages); $ld = new Language(array_keys($availableLanguages)); - $languages = $ld->detect($naked_body)->limit(0, $count)->close() ?: []; - $data = [ - 'text' => $naked_body, - 'detected' => $languages, - 'uri-id' => $uri_id, - ]; + $result = []; - Hook::callAll('detect_languages', $data); - $languages = $data['detected']; + foreach (self::splitByBlocks($searchtext) as $block) { + $languages = $ld->detect($block)->limit(0, $count)->close() ?: []; - return $languages; + $data = [ + 'text' => $block, + 'detected' => $languages, + 'uri-id' => $uri_id, + 'author-id' => $author_id, + ]; + Hook::callAll('detect_languages', $data); + + foreach ($data['detected'] as $language => $quality) { + $result[$language] = max($result[$language] ?? 0, $quality * (strlen($block) / strlen($searchtext))); + } + } + + arsort($result); + $result = array_slice($result, 0, $count); + + return $result; } /** - * Check if latin or non latin are dominant in the body and only return the dominant one + * Split a string into different unicode blocks + * Currently the text is split into the latin and the non latin part. * * @param string $body - * @return string + * @return array */ - private static function getDominantLanguage(string $body): string + private static function splitByBlocks(string $body): array { - $latin = ''; - $non_latin = ''; + $blocks = []; + $previous_block = 0; + for ($i = 0; $i < mb_strlen($body); $i++) { $character = mb_substr($body, $i, 1); - $ord = mb_ord($character); + $previous = ($i > 0) ? mb_substr($body, $i - 1, 1) : ''; + $next = ($i < mb_strlen($body)) ? mb_substr($body, $i + 1, 1) : ''; - // We add the most common characters to both strings. - if (($ord <= 64) || ($ord >= 91 && $ord <= 96) || ($ord >= 123 && $ord <= 191) || in_array($ord, [215, 247]) || ($ord >= 697 && $ord <= 735) || ($ord > 65535)) { - $latin .= $character; - $non_latin .= $character; - } elseif ($ord < 768) { - $latin .= $character; + if (!IntlChar::isalpha($character)) { + if (($previous != '') && (IntlChar::isalpha($previous))) { + $previous_block = self::getBlockCode($previous); + } + + $block = (($next != '') && IntlChar::isalpha($next)) ? self::getBlockCode($next) : $previous_block; + $blocks[$block] = ($blocks[$block] ?? '') . $character; } else { - $non_latin .= $character; + $block = self::getBlockCode($character); + $blocks[$block] = ($blocks[$block] ?? '') . $character; } } - return (mb_strlen($latin) > mb_strlen($non_latin)) ? $latin : $non_latin; + + foreach (array_keys($blocks) as $key) { + $blocks[$key] = trim($blocks[$key]); + if (empty($blocks[$key])) { + unset($blocks[$key]); + } + } + + return array_values($blocks); + } + + /** + * returns the block code for the given character + * + * @param string $character + * @return integer 0 = no alpha character (blank, signs, emojis, ...), 1 = latin character, 2 = character in every other language + */ + private static function getBlockCode(string $character): int + { + if (!IntlChar::isalpha($character)) { + return 0; + } + return self::isLatin($character) ? 1 : 2; + } + + /** + * Checks if the given character is in one of the latin code blocks + * + * @param string $character + * @return boolean + */ + private static function isLatin(string $character): bool + { + return in_array(IntlChar::getBlockCode($character), [ + IntlChar::BLOCK_CODE_BASIC_LATIN, IntlChar::BLOCK_CODE_LATIN_1_SUPPLEMENT, + IntlChar::BLOCK_CODE_LATIN_EXTENDED_A, IntlChar::BLOCK_CODE_LATIN_EXTENDED_B, + IntlChar::BLOCK_CODE_LATIN_EXTENDED_C, IntlChar::BLOCK_CODE_LATIN_EXTENDED_D, + IntlChar::BLOCK_CODE_LATIN_EXTENDED_E, IntlChar::BLOCK_CODE_LATIN_EXTENDED_ADDITIONAL + ]); } public static function getLanguageMessage(array $item): string