Merge pull request #13491 from annando/language2
More languages / use profile text as fallback
This commit is contained in:
commit
4376eedb96
|
@ -226,7 +226,8 @@ Called after the language detection. This can be used for alternative language d
|
||||||
`$data` is an array:
|
`$data` is an array:
|
||||||
|
|
||||||
- **text**: The text that is analyzed.
|
- **text**: The text that is analyzed.
|
||||||
- **detected**: (input/output) Array of language codes detected in the related text.
|
- **detected**: (input/output) Array of language codes detected in the related text. The array key is the language code; the array value is the probability.
|
||||||
|
- **uri-id**: The Uri-Id of the item.
|
||||||
|
|
||||||
### addon_settings
|
### addon_settings
|
||||||
Called when generating the HTML for the addon settings page.
|
Called when generating the HTML for the addon settings page.
|
||||||
|
|
|
@ -108,7 +108,8 @@ Wird nach der Sprachenerkennung aufgerufen.
|
||||||
Dieser Hook kann dafür verwendet werden, alternative Erkennungsfunktionen einzubinden.
|
Dieser Hook kann dafür verwendet werden, alternative Erkennungsfunktionen einzubinden.
|
||||||
`$data` ist ein Array:
|
`$data` ist ein Array:
|
||||||
'text' => Der analysierte Text.
|
'text' => Der analysierte Text.
|
||||||
'detected' => (Eingabe/Ausgabe) Das Array mit den erkannten Sprachen.
|
'detected' => (Eingabe/Ausgabe) Das Array mit den erkannten Sprachen. Der Sprachcode ist der Array-Schlüssel, der Array-Wert ist der dezimale Wert für die Wahrscheinlichkeit.
|
||||||
|
'uri-id' => Die Uri-Id des Beitrags.
|
||||||
|
|
||||||
**'addon_settings'** - wird aufgerufen, wenn die HTML-Ausgabe der Addon-Einstellungsseite generiert wird.
|
**'addon_settings'** - wird aufgerufen, wenn die HTML-Ausgabe der Addon-Einstellungsseite generiert wird.
|
||||||
$b ist die HTML-Ausgabe (String) der Addon-Einstellungsseite vor dem finalen "</form>"-Tag.
|
$b ist die HTML-Ausgabe (String) der Addon-Einstellungsseite vor dem finalen "</form>"-Tag.
|
||||||
|
|
|
@ -230,18 +230,73 @@ class BBCode
|
||||||
{
|
{
|
||||||
DI::profiler()->startRecording('rendering');
|
DI::profiler()->startRecording('rendering');
|
||||||
// Remove pictures in advance to avoid unneeded proxy calls
|
// Remove pictures in advance to avoid unneeded proxy calls
|
||||||
|
$text = preg_replace("/\[img\=([0-9]*)x([0-9]*)\](.*?)\[\/img\]/ism", ' ', $text);
|
||||||
$text = preg_replace("/\[img\=(.*?)\](.*?)\[\/img\]/ism", ' $2 ', $text);
|
$text = preg_replace("/\[img\=(.*?)\](.*?)\[\/img\]/ism", ' $2 ', $text);
|
||||||
$text = preg_replace("/\[img.*?\[\/img\]/ism", ' ', $text);
|
$text = preg_replace("/\[img.*?\[\/img\]/ism", ' ', $text);
|
||||||
|
|
||||||
// Remove attachment
|
// Remove attachment
|
||||||
$text = self::replaceAttachment($text);
|
$text = self::replaceAttachment($text);
|
||||||
|
|
||||||
$naked_text = HTML::toPlaintext(self::convert($text, false, BBCode::EXTERNAL, true), 0, !$keep_urls);
|
$naked_text = HTML::toPlaintext(self::convert($text, false, self::EXTERNAL, true), 0, !$keep_urls);
|
||||||
|
|
||||||
DI::profiler()->stopRecording();
|
DI::profiler()->stopRecording();
|
||||||
return $naked_text;
|
return $naked_text;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Converts BBCode text into plain text that can be used for the channel search and the language detection.
 *
 * Attachments, pictures, mentions and links are stripped. Alt texts of pictures
 * and the text of hashtags are kept. The result is whitespace-normalised and trimmed.
 *
 * @param string  $text   The BBCode text to convert
 * @param integer $uri_id The Uri-Id of the item. When it is not empty, the item's image
 *                        attachments are added to the text first so that their alt texts are included.
 * @return string The converted text, or an empty string when nothing is left to convert
 */
public static function toSearchText(string $text, int $uri_id): string
{
	// Removes attachments
	$text = self::removeAttachment($text);

	// Add images because of possible alt texts
	if (!empty($uri_id)) {
		$text = Post\Media::addAttachmentsToBody($uri_id, $text, [Post\Media::IMAGE]);
	}

	if (empty($text)) {
		return '';
	}

	// Remove links without a link description (the link text is the URL itself).
	// The attribute is bounded by "[^\[\]]*" and the content is matched lazily so that each
	// link is handled on its own. A greedy ".*" would match from the first "[url=" up to the
	// last "[/url]" of a line and swallow the ordinary text between two links.
	$text = preg_replace("~\[url\=[^\[\]]*\]https?:.*?\[\/url\]~", ' ', $text);

	// Remove pictures that carry their dimensions instead of an alt text
	$text = preg_replace("/\[img\=([0-9]*)x([0-9]*)\](.*?)\[\/img\]/ism", ' ', $text);

	// Replace picture with the alt description
	$text = preg_replace("/\[img\=.*?\](.*?)\[\/img\]/ism", ' $1 ', $text);

	// Remove the other pictures
	$text = preg_replace("/\[img.*?\[\/img\]/ism", ' ', $text);

	// Removes mentions, remove links from hashtags (the hashtag text itself is kept)
	$text = preg_replace('/[@!]\[url\=.*?\].*?\[\/url\]/ism', ' ', $text);
	$text = preg_replace('/[#]\[url\=.*?\](.*?)\[\/url\]/ism', ' #$1 ', $text);
	$text = preg_replace('/[@!#]?\[url.*?\[\/url\]/ism', ' ', $text);
	// NOTE(review): the replacement above already removes every "[url ... [/url]" pair, so the
	// following one appears to have nothing left to match - confirm which behaviour is intended.
	$text = preg_replace("/\[url=[^\[\]]*\](.*)\[\/url\]/Usi", ' $1 ', $text);

	// Convert it to plain text
	$text = self::toPlaintext($text, false);

	// Remove possibly remaining links
	$text = preg_replace(Strings::autoLinkRegEx(), '', $text);

	// Remove all unneeded white space: line breaks, double quotes and underscores become
	// blanks, and runs of blanks are collapsed. A single pass only halves a run of blanks,
	// hence the loop until the text no longer changes.
	do {
		$oldtext = $text;
		$text = str_replace(['  ', "\n", "\r", '"', '_'], ' ', $text);
	} while ($oldtext !== $text);

	return trim($text);
}
|
||||||
|
|
||||||
private static function proxyUrl(string $image, int $simplehtml = self::INTERNAL, int $uriid = 0, string $size = ''): string
|
private static function proxyUrl(string $image, int $simplehtml = self::INTERNAL, int $uriid = 0, string $size = ''): string
|
||||||
{
|
{
|
||||||
// Only send proxied pictures to API and for internal display
|
// Only send proxied pictures to API and for internal display
|
||||||
|
@ -931,7 +986,7 @@ class BBCode
|
||||||
$network = $contact['network'] ?? Protocol::PHANTOM;
|
$network = $contact['network'] ?? Protocol::PHANTOM;
|
||||||
|
|
||||||
$tpl = Renderer::getMarkupTemplate('shared_content.tpl');
|
$tpl = Renderer::getMarkupTemplate('shared_content.tpl');
|
||||||
$text .= BBCode::SHARED_ANCHOR . Renderer::replaceMacros($tpl, [
|
$text .= self::SHARED_ANCHOR . Renderer::replaceMacros($tpl, [
|
||||||
'$profile' => $attributes['profile'],
|
'$profile' => $attributes['profile'],
|
||||||
'$avatar' => $attributes['avatar'],
|
'$avatar' => $attributes['avatar'],
|
||||||
'$author' => $attributes['author'],
|
'$author' => $attributes['author'],
|
||||||
|
@ -1112,6 +1167,7 @@ class BBCode
|
||||||
public static function removeLinks(string $bbcode): string
|
public static function removeLinks(string $bbcode): string
|
||||||
{
|
{
|
||||||
DI::profiler()->startRecording('rendering');
|
DI::profiler()->startRecording('rendering');
|
||||||
|
$bbcode = preg_replace("/\[img\=([0-9]*)x([0-9]*)\](.*?)\[\/img\]/ism", ' ', $bbcode);
|
||||||
$bbcode = preg_replace("/\[img\=(.*?)\](.*?)\[\/img\]/ism", ' $1 ', $bbcode);
|
$bbcode = preg_replace("/\[img\=(.*?)\](.*?)\[\/img\]/ism", ' $1 ', $bbcode);
|
||||||
$bbcode = preg_replace("/\[img.*?\[\/img\]/ism", ' ', $bbcode);
|
$bbcode = preg_replace("/\[img.*?\[\/img\]/ism", ' ', $bbcode);
|
||||||
|
|
||||||
|
@ -1996,7 +2052,7 @@ class BBCode
|
||||||
{
|
{
|
||||||
DI::profiler()->startRecording('rendering');
|
DI::profiler()->startRecording('rendering');
|
||||||
|
|
||||||
$text = BBCode::performWithEscapedTags($text, ['code', 'noparse', 'nobb', 'pre'], function ($text) {
|
$text = self::performWithEscapedTags($text, ['code', 'noparse', 'nobb', 'pre'], function ($text) {
|
||||||
$text = preg_replace("/[\s|\n]*\[abstract\].*?\[\/abstract\][\s|\n]*/ism", ' ', $text);
|
$text = preg_replace("/[\s|\n]*\[abstract\].*?\[\/abstract\][\s|\n]*/ism", ' ', $text);
|
||||||
$text = preg_replace("/[\s|\n]*\[abstract=.*?\].*?\[\/abstract][\s|\n]*/ism", ' ', $text);
|
$text = preg_replace("/[\s|\n]*\[abstract=.*?\].*?\[\/abstract][\s|\n]*/ism", ' ', $text);
|
||||||
return $text;
|
return $text;
|
||||||
|
@ -2018,7 +2074,7 @@ class BBCode
|
||||||
DI::profiler()->startRecording('rendering');
|
DI::profiler()->startRecording('rendering');
|
||||||
$addon = strtolower($addon);
|
$addon = strtolower($addon);
|
||||||
|
|
||||||
$abstract = BBCode::performWithEscapedTags($text, ['code', 'noparse', 'nobb', 'pre'], function ($text) use ($addon) {
|
$abstract = self::performWithEscapedTags($text, ['code', 'noparse', 'nobb', 'pre'], function ($text) use ($addon) {
|
||||||
if ($addon && preg_match('#\[abstract=' . preg_quote($addon, '#') . '](.*?)\[/abstract]#ism', $text, $matches)) {
|
if ($addon && preg_match('#\[abstract=' . preg_quote($addon, '#') . '](.*?)\[/abstract]#ism', $text, $matches)) {
|
||||||
return $matches[1];
|
return $matches[1];
|
||||||
}
|
}
|
||||||
|
|
|
@ -400,20 +400,33 @@ class L10n
|
||||||
// Additionally some more languages are added to that list that are used in the Fediverse.
|
// Additionally some more languages are added to that list that are used in the Fediverse.
|
||||||
$additional_langs = [
|
$additional_langs = [
|
||||||
'af' => 'Afrikaans',
|
'af' => 'Afrikaans',
|
||||||
|
'az-Latn' => 'azərbaycan dili',
|
||||||
|
'bs-Latn' => 'bosanski jezik',
|
||||||
|
'be' => 'беларуская мова',
|
||||||
|
'bn' => 'বাংলা',
|
||||||
'cy' => 'Cymraeg',
|
'cy' => 'Cymraeg',
|
||||||
'el-monoton' => 'Ελληνικά',
|
'el-monoton' => 'Ελληνικά',
|
||||||
'eu' => 'euskara',
|
'eu' => 'euskara',
|
||||||
'fa' => 'فارسی',
|
'fa' => 'فارسی',
|
||||||
|
'ga' => 'Gaeilge',
|
||||||
'gl' => 'Galego',
|
'gl' => 'Galego',
|
||||||
|
'he' => 'עברית',
|
||||||
'hi' => 'हिन्दी',
|
'hi' => 'हिन्दी',
|
||||||
'hr' => 'Hrvatski',
|
'hr' => 'Hrvatski',
|
||||||
|
'hy' => 'Հայերեն',
|
||||||
'id' => 'bahasa Indonesia',
|
'id' => 'bahasa Indonesia',
|
||||||
|
'jv' => 'Basa Jawa',
|
||||||
|
'ka' => 'ქართული',
|
||||||
'ko' => '한국인',
|
'ko' => '한국인',
|
||||||
'lt' => 'lietuvių',
|
'lt' => 'lietuvių',
|
||||||
'lv' => 'latviešu',
|
'lv' => 'latviešu',
|
||||||
|
'ms-Latn' => 'Bahasa Melayu',
|
||||||
|
'sr-Cyrl' => 'српски језик',
|
||||||
'sk' => 'slovenský',
|
'sk' => 'slovenský',
|
||||||
'sl' => 'Slovenščina',
|
'sl' => 'Slovenščina',
|
||||||
|
'sq' => 'Shqip',
|
||||||
'sw' => 'Kiswahili',
|
'sw' => 'Kiswahili',
|
||||||
|
'ta' => 'தமிழ்',
|
||||||
'th' => 'แบบไทย',
|
'th' => 'แบบไทย',
|
||||||
'tl' => 'Wikang Tagalog',
|
'tl' => 'Wikang Tagalog',
|
||||||
'tr' => 'Türkçe',
|
'tr' => 'Türkçe',
|
||||||
|
|
|
@ -1987,7 +1987,7 @@ class Item
|
||||||
return '';
|
return '';
|
||||||
}
|
}
|
||||||
|
|
||||||
$languages = self::getLanguageArray(trim($item['title'] . "\n" . $item['body']), 3);
|
$languages = self::getLanguageArray(trim($item['title'] . "\n" . $item['body']), 3, $item['uri-id'], $item['author-id']);
|
||||||
if (empty($languages)) {
|
if (empty($languages)) {
|
||||||
return '';
|
return '';
|
||||||
}
|
}
|
||||||
|
@ -2000,24 +2000,23 @@ class Item
|
||||||
*
|
*
|
||||||
* @param string $body
|
* @param string $body
|
||||||
* @param integer $count
|
* @param integer $count
|
||||||
|
* @param integer $uri_id
|
||||||
|
* @param integer $author_id
|
||||||
* @return array
|
* @return array
|
||||||
*/
|
*/
|
||||||
public static function getLanguageArray(string $body, int $count): array
|
public static function getLanguageArray(string $body, int $count, int $uri_id = 0, int $author_id = 0): array
|
||||||
{
|
{
|
||||||
// Convert attachments to links
|
$naked_body = BBCode::toSearchText($body, $uri_id);
|
||||||
$naked_body = BBCode::removeAttachment($body);
|
|
||||||
if (empty($naked_body)) {
|
if ((count(explode(' ', $naked_body)) < 10) && (mb_strlen($naked_body) < 30) && $author_id) {
|
||||||
return [];
|
$author = Contact::selectFirst(['about'], ['id' => $author_id]);
|
||||||
|
if (!empty($author['about'])) {
|
||||||
|
$about = BBCode::toSearchText($author['about'], 0);
|
||||||
|
$about = self::getDominantLanguage($about);
|
||||||
|
Logger::debug('About field added', ['author' => $author_id, 'body' => $naked_body, 'about' => $about]);
|
||||||
|
$naked_body .= ' ' . $about;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Remove links and pictures
|
|
||||||
$naked_body = BBCode::removeLinks($naked_body);
|
|
||||||
|
|
||||||
// Convert the title and the body to plain text
|
|
||||||
$naked_body = BBCode::toPlaintext($naked_body);
|
|
||||||
|
|
||||||
// Remove possibly remaining links
|
|
||||||
$naked_body = trim(preg_replace(Strings::autoLinkRegEx(), '', $naked_body));
|
|
||||||
|
|
||||||
if (empty($naked_body)) {
|
if (empty($naked_body)) {
|
||||||
return [];
|
return [];
|
||||||
|
@ -2034,6 +2033,7 @@ class Item
|
||||||
$data = [
|
$data = [
|
||||||
'text' => $naked_body,
|
'text' => $naked_body,
|
||||||
'detected' => $languages,
|
'detected' => $languages,
|
||||||
|
'uri-id' => $uri_id,
|
||||||
];
|
];
|
||||||
|
|
||||||
Hook::callAll('detect_languages', $data);
|
Hook::callAll('detect_languages', $data);
|
||||||
|
|
Loading…
Reference in New Issue
Block a user